X86ISelLowering.cpp revision 59d3ae6cdc4316ad338cd848251f33a236ccb36c
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86.h"
#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <bitset>
#include <cctype>
using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
                                SelectionDAG &DAG, SDLoc dl,
                                unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getUNDEF(ResultVT);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
                               * ElemsPerChunk);

  // If the input is a build_vector, just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
                       Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                               VecIdx);

  return Result;
}
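
// Worked example for the index normalization above (illustrative, not part
// of the original source): extracting element IdxVal = 5 from a v8i32 with
// vectorWidth = 128 gives
//   ElemsPerChunk    = 128 / 32 = 4
//   NormalizedIdxVal = ((5 * 32) / 128) * 4 = 4
// i.e. the extract is rounded down to the 128-bit chunk holding element 5,
// which starts at element 4 (the upper v4i32 half).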

/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
}

static SDValue InsertSubVector(SDValue Result, SDValue Vec,
                               unsigned IdxVal, SelectionDAG &DAG,
                               SDLoc dl, unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF leaves Result unchanged.
  if (Vec.getOpcode() == ISD::UNDEF)
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
                               * ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                     VecIdx);
}

/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
                                  unsigned IdxVal, SelectionDAG &DAG,
                                  SDLoc dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
                                  unsigned IdxVal, SelectionDAG &DAG,
                                  SDLoc dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}

/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTORS nodes of
/// BUILD_VECTORs returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORs.
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}

static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
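
// Illustrative expansion of Concat128BitVectors (a sketch using the helpers
// above; the values are assumed for the example): concatenating two v4i32
// halves into a v8i32 becomes two stacked 128-bit inserts,
//   SDValue Lo  = Insert128BitVector(DAG.getUNDEF(MVT::v8i32), V1, 0, DAG, dl);
//   SDValue Res = Insert128BitVector(Lo, V2, /*NumElems/2=*/4, DAG, dl);
// which instruction selection can match to VINSERTF128.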

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X86_64MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetLinux())
    return new X86LinuxTargetObjectFile();
  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  TD = getDataLayout();

  resetOperationActions();
}

void X86TargetLowering::resetOperationActions() {
  const TargetMachine &TM = getTargetMachine();
  static bool FirstTimeThrough = true;

  // If none of the target options have changed, then we don't need to reset
  // the operation actions.
  if (!FirstTimeThrough && TO == TM.Options) return;

  if (!FirstTimeThrough) {
    // Reinitialize the actions.
    initActions();
    FirstTimeThrough = false;
  }

  TO = TM.Options;

  // Set up the TargetLowering object.
  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird: it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler; for
  // 32-bit code use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget->isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides on Atom when compiling with O2.
  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
    addBypassSlowDiv(32, 8);
    if (Subtarget->is64Bit())
      addBypassSlowDiv(64, 16);
  }
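
  // Illustrative effect of the bypass above (a sketch, not part of the
  // original source): for a 32-bit `a / b` the emitted code first tests
  // whether both operands fit in 8 bits, roughly
  //   if (((a | b) >> 8) == 0)  use the cheap 8-bit divide;
  //   else                      use the full-width divide;
  // which pays off on in-order cores like Atom where wide DIV is slow.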

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Set up Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

    // The _ftol2 runtime function has an unusual calling conv, which
    // is modeled by a special pseudo-instruction.
    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but plain longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }
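
  // Worked example for the promotions above (illustrative): a u16 -> f64
  // convert is legalized as
  //   zext i16 -> i32, then signed i32 -> f64 (SINT_TO_FP),
  // which is exact because a zero-extended 16-bit value always lands in the
  // non-negative range of i32.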

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  if (isTargetFTOL()) {
    // Use the _ftol2 runtime function, which has a pseudo-instruction
    // to handle its weird calling convention.
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  }
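
  // Worked example for the FP_TO_UINT promotion above (illustrative): on
  // x86-64, fptoui f64 -> i32 is rewritten as the wider signed convert
  //   fptosi f64 -> i64, then truncate to i32,
  // since CVTTSD2SI with a 64-bit destination covers the whole u32 range.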

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f80, Expand);
  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them up to i32, which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ, MVT::i8, Promote);
  AddPromotedToType (ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Promote);
  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
  }
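
  // Illustrative note on the BMI split above: TZCNT (BMI) is defined for a
  // zero input (it returns the operand width), so CTTZ_ZERO_UNDEF can simply
  // expand to plain CTTZ. Without BMI only BSF exists, whose destination is
  // undefined for a zero input, so the Custom lowering has to guard it,
  // conceptually:
  //   unsigned cttz32(unsigned x) { return x ? __builtin_ctz(x) : 32; }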

  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ, MVT::i8, Promote);
    AddPromotedToType (ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Promote);
    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP as supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuations, user-level threading, and so on. As a result, no
  // other SjLj exception interfaces are implemented; please don't build
  // your own exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
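
  // Illustrative lowering for the SELECT customs above (a sketch): an IR
  //   %r = select i1 %c, i32 %a, i32 %b
  // is turned into a TEST of %c that defines EFLAGS, feeding an X86ISD::CMOV
  // node that picks %a or %b on the resulting condition.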

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
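
  // Illustrative expansion for the *_PARTS customs above (a sketch): a 64-bit
  // shift left on 32-bit x86 is split into two 32-bit halves, roughly
  //   hi = SHLD(hi, lo, amt);  lo = SHL(lo, amt);
  // plus a test of bit 5 of the amount to move lo into hi and zero lo when
  // the shift amount is 32 or more.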

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
    // TargetInfo::X86_64ABIBuiltinVaList
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    // TargetInfo::CharPtrBuiltinVaList
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else if (TM.Options.EnableSegmentedStacks)
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Expand);
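
  // Note on the Windows case above (illustrative): dynamic stack allocations
  // on Windows must touch the stack one page at a time, so the Custom
  // lowering emits a call to the chkstk-style runtime probe instead of just
  // subtracting from the stack pointer.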

  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FSIN, MVT::f32, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f32, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
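
  // Illustrative detail for the FABS/FNEG customs above (a sketch): with SSE
  // these become bitwise ops against constant sign masks, e.g. for f32
  //   fabs(x) = x & 0x7fffffff   (ANDPS with a constant-pool mask)
  //   fneg(x) = x ^ 0x80000000   (XORPS)
  // and FCOPYSIGN combines the two masks with AND/OR.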

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
       i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT VT = (MVT::SimpleValueType)i;
    setOperationAction(ISD::ADD, VT, Expand);
    setOperationAction(ISD::SUB, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::MUL, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction(VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND, MVT::v8i8, Expand);
  setOperationAction(ISD::AND, MVT::v4i16, Expand);
  setOperationAction(ISD::AND, MVT::v2i32, Expand);
  setOperationAction(ISD::AND, MVT::v1i64, Expand);
  setOperationAction(ISD::OR, MVT::v8i8, Expand);
  setOperationAction(ISD::OR, MVT::v4i16, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::OR, MVT::v1i64, Expand);
  setOperationAction(ISD::XOR, MVT::v8i8, Expand);
  setOperationAction(ISD::XOR, MVT::v4i16, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  setOperationAction(ISD::XOR, MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
  setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
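
  // Illustrative consequence of the defaults above (a sketch): with the MMX
  // types expanded, a v2i32 add coming from IR is type-legalized away from
  // MMX, typically by widening to v4i32 when SSE2 is available, rather than
  // being matched to an MMX PADDD.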

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
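
    // Illustrative detail for the SETCC customs above (a sketch): SSE2 only
    // provides equality and signed greater-than compares (PCMPEQ*/PCMPGT*),
    // so e.g. a v4i32 SETCC on SETLT is lowered by swapping the operands and
    // emitting PCMPGTD, producing the usual all-ones/all-zeros mask.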

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v2i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v2i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget->is64Bit())
      setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
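
    // Illustrative mapping for the rounding ops above (a sketch): SSE4.1
    // ROUNDPS/ROUNDPD take an immediate rounding mode, e.g. for v4f32
    //   FFLOOR -> roundps $1, FCEIL -> roundps $2, FTRUNC -> roundps $3,
    // while FNEARBYINT/FRINT use the current MXCSR rounding mode.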

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    // FIXME: these should be Legal, but that's only for the case where
    // the index is constant. For now custom expand to deal with that.
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
  }

  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL, MVT::v8i16, Custom);
    setOperationAction(ISD::SRL, MVT::v16i8, Custom);

    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
    setOperationAction(ISD::SRA, MVT::v16i8, Custom);

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    setOperationAction(ISD::SRL, MVT::v2i64, Custom);
    setOperationAction(ISD::SRL, MVT::v4i32, Custom);

    setOperationAction(ISD::SHL, MVT::v2i64, Custom);
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);

    setOperationAction(ISD::SRA, MVT::v4i32, Custom);

    setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
  }
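
  // Illustrative note on the shift customs above (a sketch): SSE2 only has
  // shifts by a single splat amount, e.g. PSLLD for v4i32, so shifts with a
  // uniform amount are selected directly while per-element variable amounts
  // have to be synthesized; with AVX2 the variable forms (e.g. VPSLLVD)
  // become legal and are recognized by the custom lowering.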

  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
    addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, &X86::VR256RegClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);

    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::FABS, MVT::v8f32, Custom);

    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
    setOperationAction(ISD::FABS, MVT::v4f64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);

    setOperationAction(ISD::SRL, MVT::v16i16, Custom);
    setOperationAction(ISD::SRL, MVT::v32i8, Custom);

    setOperationAction(ISD::SHL, MVT::v16i16, Custom);
    setOperationAction(ISD::SHL, MVT::v32i8, Custom);

    setOperationAction(ISD::SRA, MVT::v16i16, Custom);
    setOperationAction(ISD::SRA, MVT::v32i8, Custom);

    setOperationAction(ISD::SDIV, MVT::v16i16, Custom);

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
    setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);

    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
      setOperationAction(ISD::FMA, MVT::v8f32, Legal);
      setOperationAction(ISD::FMA, MVT::v4f64, Legal);
      setOperationAction(ISD::FMA, MVT::v4f32, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::f32, Legal);
      setOperationAction(ISD::FMA, MVT::f64, Legal);
    }
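
    // Illustrative effect of the FMA legality above (a sketch): an
    //   llvm.fma.v8f32(a, b, c)
    // intrinsic can now be selected directly to a fused multiply-add such as
    // VFMADD213PS (FMA3) or VFMADDPS (FMA4) instead of a separate multiply
    // and add.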

    if (Subtarget->hasInt256()) {
      setOperationAction(ISD::ADD, MVT::v4i64, Legal);
      setOperationAction(ISD::ADD, MVT::v8i32, Legal);
      setOperationAction(ISD::ADD, MVT::v16i16, Legal);
      setOperationAction(ISD::ADD, MVT::v32i8, Legal);

      setOperationAction(ISD::SUB, MVT::v4i64, Legal);
      setOperationAction(ISD::SUB, MVT::v8i32, Legal);
      setOperationAction(ISD::SUB, MVT::v16i16, Legal);
      setOperationAction(ISD::SUB, MVT::v32i8, Legal);

      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
      setOperationAction(ISD::MUL, MVT::v8i32, Legal);
      setOperationAction(ISD::MUL, MVT::v16i16, Legal);
      // Don't lower v32i8 because there is no 128-bit byte mul.

      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

      setOperationAction(ISD::SDIV, MVT::v8i32, Custom);
    } else {
      setOperationAction(ISD::ADD, MVT::v4i64, Custom);
      setOperationAction(ISD::ADD, MVT::v8i32, Custom);
      setOperationAction(ISD::ADD, MVT::v16i16, Custom);
      setOperationAction(ISD::ADD, MVT::v32i8, Custom);

      setOperationAction(ISD::SUB, MVT::v4i64, Custom);
      setOperationAction(ISD::SUB, MVT::v8i32, Custom);
      setOperationAction(ISD::SUB, MVT::v16i16, Custom);
      setOperationAction(ISD::SUB, MVT::v32i8, Custom);

      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
      setOperationAction(ISD::MUL, MVT::v8i32, Custom);
      setOperationAction(ISD::MUL, MVT::v16i16, Custom);
      // Don't lower v32i8 because there is no 128-bit byte mul.
    }

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    setOperationAction(ISD::SRL, MVT::v4i64, Custom);
    setOperationAction(ISD::SRL, MVT::v8i32, Custom);

    setOperationAction(ISD::SHL, MVT::v4i64, Custom);
    setOperationAction(ISD::SHL, MVT::v8i32, Custom);

    setOperationAction(ISD::SRA, MVT::v8i32, Custom);

    // Custom lower several nodes for 256-bit types.
    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
         i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Extract subvector is special because the value type
      // (result) is 128-bit but the source is 256-bit wide.
      if (VT.is128BitVector())
        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

      // Do not attempt to custom lower other non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }
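
    // Illustrative note on the AVX1 else-branch above (a sketch): plain AVX
    // has no 256-bit integer ALU ops, so the Custom lowering splits e.g. a
    // v8i32 ADD into two v4i32 halves with Extract128BitVector, adds each
    // half with PADDD, and recombines the results with Concat128BitVectors.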

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
    setOperationAction(ISD::LOAD, MVT::v16i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v16i1, Legal);

    setOperationAction(ISD::FADD, MVT::v16f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v16f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v16f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v16f32, Custom);

    setOperationAction(ISD::FADD, MVT::v8f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
    setOperationAction(ISD::FMA, MVT::v8f64, Legal);
    setOperationAction(ISD::FMA, MVT::v16f32, Legal);
    setOperationAction(ISD::SDIV, MVT::v16i32, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
    }
    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
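
    // Illustrative note on the mask register classes above (a sketch): with
    // AVX-512, v16i1/v8i1 live in the k0-k7 mask registers, so a v16i32
    // compare produces a v16i1 in a k-register that can directly predicate
    // masked operations or a VSELECT.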
1359 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); 1360 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); 1361 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); 1362 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); 1363 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); 1364 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); 1365 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); 1366 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); 1367 1368 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); 1369 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); 1370 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); 1371 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); 1372 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); 1373 1374 setOperationAction(ISD::SETCC, MVT::v16i1, Custom); 1375 setOperationAction(ISD::SETCC, MVT::v8i1, Custom); 1376 1377 setOperationAction(ISD::MUL, MVT::v8i64, Custom); 1378 1379 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); 1380 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); 1381 setOperationAction(ISD::SELECT, MVT::v8f64, Custom); 1382 setOperationAction(ISD::SELECT, MVT::v8i64, Custom); 1383 setOperationAction(ISD::SELECT, MVT::v16f32, Custom); 1384 1385 setOperationAction(ISD::ADD, MVT::v8i64, Legal); 1386 setOperationAction(ISD::ADD, MVT::v16i32, Legal); 1387 1388 setOperationAction(ISD::SUB, MVT::v8i64, Legal); 1389 setOperationAction(ISD::SUB, MVT::v16i32, Legal); 1390 1391 setOperationAction(ISD::MUL, MVT::v16i32, Legal); 1392 1393 setOperationAction(ISD::SRL, MVT::v8i64, Custom); 1394 setOperationAction(ISD::SRL, MVT::v16i32, Custom); 1395 1396 setOperationAction(ISD::SHL, MVT::v8i64, Custom); 1397 setOperationAction(ISD::SHL, MVT::v16i32, Custom); 1398 1399 setOperationAction(ISD::SRA, MVT::v8i64, Custom); 1400 setOperationAction(ISD::SRA, MVT::v16i32, Custom); 1401 1402 setOperationAction(ISD::AND, MVT::v8i64, Legal); 1403 setOperationAction(ISD::OR, MVT::v8i64, Legal); 1404 setOperationAction(ISD::XOR, MVT::v8i64, Legal); 1405 setOperationAction(ISD::AND, MVT::v16i32, Legal); 1406 setOperationAction(ISD::OR, MVT::v16i32, Legal); 1407 setOperationAction(ISD::XOR, MVT::v16i32, Legal); 1408 1409 // Custom lower several nodes. 1410 for (int i = MVT::FIRST_VECTOR_VALUETYPE; 1411 i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { 1412 MVT VT = (MVT::SimpleValueType)i; 1413 1414 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 1415 // Extract subvector is special because the value type 1416 // (result) is 256/128-bit but the source is 512-bit wide. 
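    // Illustrative example (an assumption about the eventual selection, not a
    // guarantee): extracting the high half of a 512-bit vector, e.g.
    //   (v8f32 (extract_subvector (v16f32 %zmm), 8))
    // should match an AVX-512 VEXTRACTF64x4-style instruction, while index 0
    // can be selected as a plain subregister copy.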
1417       if (VT.is128BitVector() || VT.is256BitVector())
1418         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1419
1420       if (VT.getVectorElementType() == MVT::i1)
1421         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1422
1423       // Do not attempt to custom lower other non-512-bit vectors
1424       if (!VT.is512BitVector())
1425         continue;
1426
1427       if (EltSize >= 32) {
1428         setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1429         setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1430         setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1431         setOperationAction(ISD::VSELECT, VT, Legal);
1432         setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1433         setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1434         setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1435       }
1436     }
1437     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1438       MVT VT = (MVT::SimpleValueType)i;
1439
1440       // Do not attempt to promote non-512-bit vectors
1441       if (!VT.is512BitVector())
1442         continue;
1443
1444       setOperationAction(ISD::SELECT, VT, Promote);
1445       AddPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1446     }
1447   } // has AVX-512
1448
1449   // SIGN_EXTEND_INREGs are evaluated by their extend type. Handle the
1450   // expansion of these with custom code.
1451   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
1452        VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
1453     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
1454                        Custom);
1455   }
1456
1457   // We want to custom lower some of our intrinsics.
1458   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1459   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1460   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1461
1462   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1463   // handle type legalization for these operations here.
1464   //
1465   // FIXME: We really should do custom legalization for addition and
1466   // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1467   // than generic legalization for 64-bit multiplication-with-overflow, though.
1468   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1469     // Add/Sub/Mul with overflow operations are custom lowered.
1470     MVT VT = IntVTs[i];
1471     setOperationAction(ISD::SADDO, VT, Custom);
1472     setOperationAction(ISD::UADDO, VT, Custom);
1473     setOperationAction(ISD::SSUBO, VT, Custom);
1474     setOperationAction(ISD::USUBO, VT, Custom);
1475     setOperationAction(ISD::SMULO, VT, Custom);
1476     setOperationAction(ISD::UMULO, VT, Custom);
1477   }
1478
1479   // There are no 8-bit 3-address imul/mul instructions.
1480   setOperationAction(ISD::SMULO, MVT::i8, Expand);
1481   setOperationAction(ISD::UMULO, MVT::i8, Expand);
1482
1483   if (!Subtarget->is64Bit()) {
1484     // These libcalls are not available on 32-bit.
1485     setLibcallName(RTLIB::SHL_I128, 0);
1486     setLibcallName(RTLIB::SRL_I128, 0);
1487     setLibcallName(RTLIB::SRA_I128, 0);
1488   }
1489
1490   // Combine sin / cos into one node or libcall if possible.
1491   if (Subtarget->hasSinCos()) {
1492     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1493     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1494     if (Subtarget->isTargetDarwin()) {
1495       // For MacOSX, we don't want the normal expansion of a libcall to
1496       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
1497       // traffic.
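      // Roughly (illustrative, not the exact library prototype): instead of
      //   sincos(x, &s, &c);                            // two stores to memory
      // the Darwin entry point behaves like
      //   struct { double sin, cos; } r = __sincos_stret(x);
      // returning both results in registers.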
1498       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1499       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1500     }
1501   }
1502
1503   // We have target-specific dag combine patterns for the following nodes:
1504   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1505   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1506   setTargetDAGCombine(ISD::VSELECT);
1507   setTargetDAGCombine(ISD::SELECT);
1508   setTargetDAGCombine(ISD::SHL);
1509   setTargetDAGCombine(ISD::SRA);
1510   setTargetDAGCombine(ISD::SRL);
1511   setTargetDAGCombine(ISD::OR);
1512   setTargetDAGCombine(ISD::AND);
1513   setTargetDAGCombine(ISD::ADD);
1514   setTargetDAGCombine(ISD::FADD);
1515   setTargetDAGCombine(ISD::FSUB);
1516   setTargetDAGCombine(ISD::FMA);
1517   setTargetDAGCombine(ISD::SUB);
1518   setTargetDAGCombine(ISD::LOAD);
1519   setTargetDAGCombine(ISD::STORE);
1520   setTargetDAGCombine(ISD::ZERO_EXTEND);
1521   setTargetDAGCombine(ISD::ANY_EXTEND);
1522   setTargetDAGCombine(ISD::SIGN_EXTEND);
1523   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1524   setTargetDAGCombine(ISD::TRUNCATE);
1525   setTargetDAGCombine(ISD::SINT_TO_FP);
1526   setTargetDAGCombine(ISD::SETCC);
1527   if (Subtarget->is64Bit())
1528     setTargetDAGCombine(ISD::MUL);
1529   setTargetDAGCombine(ISD::XOR);
1530
1531   computeRegisterProperties();
1532
1533   // On Darwin, -Os means optimize for size without hurting performance, so
1534   // do not reduce the limit.
1535   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1536   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1537   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1538   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1539   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1540   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1541   setPrefLoopAlignment(4); // 2^4 bytes.
1542
1543   // Predictable cmovs don't hurt on Atom because it's in-order.
1544   PredictableSelectIsExpensive = !Subtarget->isAtom();
1545
1546   setPrefFunctionAlignment(4); // 2^4 bytes.
1547 }
1548
1549 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1550   if (!VT.isVector())
1551     return MVT::i8;
1552
1553   const TargetMachine &TM = getTargetMachine();
1554   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512())
1555     switch (VT.getVectorNumElements()) {
1556     case 8: return MVT::v8i1;
1557     case 16: return MVT::v16i1;
1558     }
1559
1560   return VT.changeVectorElementTypeToInteger();
1561 }
1562
1563 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1564 /// the desired ByVal argument alignment.
1565 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1566   if (MaxAlign == 16)
1567     return;
1568   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1569     if (VTy->getBitWidth() == 128)
1570       MaxAlign = 16;
1571   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1572     unsigned EltAlign = 0;
1573     getMaxByValAlign(ATy->getElementType(), EltAlign);
1574     if (EltAlign > MaxAlign)
1575       MaxAlign = EltAlign;
1576   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1577     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1578       unsigned EltAlign = 0;
1579       getMaxByValAlign(STy->getElementType(i), EltAlign);
1580       if (EltAlign > MaxAlign)
1581         MaxAlign = EltAlign;
1582       if (MaxAlign == 16)
1583         break;
1584     }
1585   }
1586 }
1587
1588 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1589 /// function arguments in the caller parameter area. For X86, aggregates
1590 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1591 /// are at 4-byte boundaries.
1592 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1593   if (Subtarget->is64Bit()) {
1594     // Max of 8 and alignment of type.
1595     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1596     if (TyAlign > 8)
1597       return TyAlign;
1598     return 8;
1599   }
1600
1601   unsigned Align = 4;
1602   if (Subtarget->hasSSE1())
1603     getMaxByValAlign(Ty, Align);
1604   return Align;
1605 }
1606
1607 /// getOptimalMemOpType - Returns the target-specific optimal type for load
1608 /// and store operations as a result of memset, memcpy, and memmove
1609 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
1610 /// constraint. Similarly, if SrcAlign is zero there is no need to check it
1611 /// against the alignment requirement, probably because the source does not
1612 /// need to be loaded. If 'IsMemset' is true, that means it's expanding a
1613 /// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
1614 /// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
1615 /// not need to be loaded.
1616 /// It returns EVT::Other if the type should be determined using generic
1617 /// target-independent logic.
1618 EVT
1619 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1620                                        unsigned DstAlign, unsigned SrcAlign,
1621                                        bool IsMemset, bool ZeroMemset,
1622                                        bool MemcpyStrSrc,
1623                                        MachineFunction &MF) const {
1624   const Function *F = MF.getFunction();
1625   if ((!IsMemset || ZeroMemset) &&
1626       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1627                                        Attribute::NoImplicitFloat)) {
1628     if (Size >= 16 &&
1629         (Subtarget->isUnalignedMemAccessFast() ||
1630          ((DstAlign == 0 || DstAlign >= 16) &&
1631           (SrcAlign == 0 || SrcAlign >= 16)))) {
1632       if (Size >= 32) {
1633         if (Subtarget->hasInt256())
1634           return MVT::v8i32;
1635         if (Subtarget->hasFp256())
1636           return MVT::v8f32;
1637       }
1638       if (Subtarget->hasSSE2())
1639         return MVT::v4i32;
1640       if (Subtarget->hasSSE1())
1641         return MVT::v4f32;
1642     } else if (!MemcpyStrSrc && Size >= 8 &&
1643                !Subtarget->is64Bit() &&
1644                Subtarget->hasSSE2()) {
1645       // Do not use f64 to lower memcpy if the source is a string constant.
1646       // It's better to use i32 to avoid the loads.
1647       return MVT::f64;
1648     }
1649   }
1650   if (Subtarget->is64Bit() && Size >= 8)
1651     return MVT::i64;
1652   return MVT::i32;
1653 }
1654
1655 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1656   if (VT == MVT::f32)
1657     return X86ScalarSSEf32;
1658   else if (VT == MVT::f64)
1659     return X86ScalarSSEf64;
1660   return true;
1661 }
1662
1663 bool
1664 X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
1665   if (Fast)
1666     *Fast = Subtarget->isUnalignedMemAccessFast();
1667   return true;
1668 }
1669
1670 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
1671 /// current function. The returned value is a member of the
1672 /// MachineJumpTableInfo::JTEntryKind enum.
1673 unsigned X86TargetLowering::getJumpTableEncoding() const {
1674   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1675   // symbol.
1676   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1677       Subtarget->isPICStyleGOT())
1678     return MachineJumpTableInfo::EK_Custom32;
1679
1680   // Otherwise, use the normal jump table encoding heuristics.
1681   return TargetLowering::getJumpTableEncoding();
1682 }
1683
1684 const MCExpr *
1685 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1686                                              const MachineBasicBlock *MBB,
1687                                              unsigned uid, MCContext &Ctx) const {
1688   assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1689          Subtarget->isPICStyleGOT());
1690   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1691   // entries.
1692   return MCSymbolRefExpr::Create(MBB->getSymbol(),
1693                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1694 }
1695
1696 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
1697 /// jumptable.
1698 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1699                                                     SelectionDAG &DAG) const {
1700   if (!Subtarget->is64Bit())
1701     // This doesn't have an SDLoc associated with it, but it is not really the
1702     // same as a Register.
1703     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1704   return Table;
1705 }
1706
1707 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1708 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1709 /// MCExpr.
1710 const MCExpr *X86TargetLowering::
1711 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1712                              MCContext &Ctx) const {
1713   // X86-64 uses RIP-relative addressing based on the jump table label.
1714   if (Subtarget->isPICStyleRIPRel())
1715     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1716
1717   // Otherwise, the reference is relative to the PIC base.
1718   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1719 }
1720
1721 // FIXME: Why is this routine here? Move to RegInfo!
1722 std::pair<const TargetRegisterClass*, uint8_t>
1723 X86TargetLowering::findRepresentativeClass(MVT VT) const {
1724   const TargetRegisterClass *RRC = 0;
1725   uint8_t Cost = 1;
1726   switch (VT.SimpleTy) {
1727   default:
1728     return TargetLowering::findRepresentativeClass(VT);
1729   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1730     RRC = Subtarget->is64Bit() ?
1731 (const TargetRegisterClass*)&X86::GR64RegClass : 1732 (const TargetRegisterClass*)&X86::GR32RegClass; 1733 break; 1734 case MVT::x86mmx: 1735 RRC = &X86::VR64RegClass; 1736 break; 1737 case MVT::f32: case MVT::f64: 1738 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1739 case MVT::v4f32: case MVT::v2f64: 1740 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1741 case MVT::v4f64: 1742 RRC = &X86::VR128RegClass; 1743 break; 1744 } 1745 return std::make_pair(RRC, Cost); 1746} 1747 1748bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1749 unsigned &Offset) const { 1750 if (!Subtarget->isTargetLinux()) 1751 return false; 1752 1753 if (Subtarget->is64Bit()) { 1754 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1755 Offset = 0x28; 1756 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1757 AddressSpace = 256; 1758 else 1759 AddressSpace = 257; 1760 } else { 1761 // %gs:0x14 on i386 1762 Offset = 0x14; 1763 AddressSpace = 256; 1764 } 1765 return true; 1766} 1767 1768bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, 1769 unsigned DestAS) const { 1770 assert(SrcAS != DestAS && "Expected different address spaces!"); 1771 1772 return SrcAS < 256 && DestAS < 256; 1773} 1774 1775//===----------------------------------------------------------------------===// 1776// Return Value Calling Convention Implementation 1777//===----------------------------------------------------------------------===// 1778 1779#include "X86GenCallingConv.inc" 1780 1781bool 1782X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1783 MachineFunction &MF, bool isVarArg, 1784 const SmallVectorImpl<ISD::OutputArg> &Outs, 1785 LLVMContext &Context) const { 1786 SmallVector<CCValAssign, 16> RVLocs; 1787 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1788 RVLocs, Context); 1789 return CCInfo.CheckReturn(Outs, RetCC_X86); 1790} 1791 1792const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { 1793 static const uint16_t ScratchRegs[] = { X86::R11, 0 }; 1794 return ScratchRegs; 1795} 1796 1797SDValue 1798X86TargetLowering::LowerReturn(SDValue Chain, 1799 CallingConv::ID CallConv, bool isVarArg, 1800 const SmallVectorImpl<ISD::OutputArg> &Outs, 1801 const SmallVectorImpl<SDValue> &OutVals, 1802 SDLoc dl, SelectionDAG &DAG) const { 1803 MachineFunction &MF = DAG.getMachineFunction(); 1804 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1805 1806 SmallVector<CCValAssign, 16> RVLocs; 1807 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1808 RVLocs, *DAG.getContext()); 1809 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1810 1811 SDValue Flag; 1812 SmallVector<SDValue, 6> RetOps; 1813 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1814 // Operand #1 = Bytes To Pop 1815 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1816 MVT::i16)); 1817 1818 // Copy the result values into the output registers. 
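  // The final return is assembled from these pieces; illustrative operand
  // list (an approximation of the node we build below):
  //   (X86ISD::RET_FLAG chain, TargetConstant<bytesToPop>, %EAX, ..., glue)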
1819 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1820 CCValAssign &VA = RVLocs[i]; 1821 assert(VA.isRegLoc() && "Can only return in registers!"); 1822 SDValue ValToCopy = OutVals[i]; 1823 EVT ValVT = ValToCopy.getValueType(); 1824 1825 // Promote values to the appropriate types 1826 if (VA.getLocInfo() == CCValAssign::SExt) 1827 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 1828 else if (VA.getLocInfo() == CCValAssign::ZExt) 1829 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); 1830 else if (VA.getLocInfo() == CCValAssign::AExt) 1831 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); 1832 else if (VA.getLocInfo() == CCValAssign::BCvt) 1833 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); 1834 1835 // If this is x86-64, and we disabled SSE, we can't return FP values, 1836 // or SSE or MMX vectors. 1837 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1838 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1839 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1840 report_fatal_error("SSE register return with SSE disabled"); 1841 } 1842 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1843 // llvm-gcc has never done it right and no one has noticed, so this 1844 // should be OK for now. 1845 if (ValVT == MVT::f64 && 1846 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) 1847 report_fatal_error("SSE2 register return with SSE2 disabled"); 1848 1849 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1850 // the RET instruction and handled by the FP Stackifier. 1851 if (VA.getLocReg() == X86::ST0 || 1852 VA.getLocReg() == X86::ST1) { 1853 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1854 // change the value to the FP stack register class. 1855 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1856 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1857 RetOps.push_back(ValToCopy); 1858 // Don't emit a copytoreg. 1859 continue; 1860 } 1861 1862 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1863 // which is returned in RAX / RDX. 1864 if (Subtarget->is64Bit()) { 1865 if (ValVT == MVT::x86mmx) { 1866 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1867 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1868 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1869 ValToCopy); 1870 // If we don't have SSE2 available, convert to v4f32 so the generated 1871 // register is legal. 1872 if (!Subtarget->hasSSE2()) 1873 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1874 } 1875 } 1876 } 1877 1878 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1879 Flag = Chain.getValue(1); 1880 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 1881 } 1882 1883 // The x86-64 ABIs require that for returning structs by value we copy 1884 // the sret argument into %rax/%eax (depending on ABI) for the return. 1885 // Win32 requires us to put the sret argument to %eax as well. 1886 // We saved the argument into a virtual register in the entry block, 1887 // so now we copy the value out and into %rax/%eax. 
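  // Illustrative IR (hypothetical function): for
  //   define void @f(%struct.S* sret %agg.result) { ... }
  // the x86-64 return sequence ends with the sret pointer copied back into
  // %rax before the RET.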
1888 if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() && 1889 (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { 1890 MachineFunction &MF = DAG.getMachineFunction(); 1891 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1892 unsigned Reg = FuncInfo->getSRetReturnReg(); 1893 assert(Reg && 1894 "SRetReturnReg should have been set in LowerFormalArguments()."); 1895 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1896 1897 unsigned RetValReg 1898 = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? 1899 X86::RAX : X86::EAX; 1900 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); 1901 Flag = Chain.getValue(1); 1902 1903 // RAX/EAX now acts like a return value. 1904 RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy())); 1905 } 1906 1907 RetOps[0] = Chain; // Update chain. 1908 1909 // Add the flag if we have it. 1910 if (Flag.getNode()) 1911 RetOps.push_back(Flag); 1912 1913 return DAG.getNode(X86ISD::RET_FLAG, dl, 1914 MVT::Other, &RetOps[0], RetOps.size()); 1915} 1916 1917bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 1918 if (N->getNumValues() != 1) 1919 return false; 1920 if (!N->hasNUsesOfValue(1, 0)) 1921 return false; 1922 1923 SDValue TCChain = Chain; 1924 SDNode *Copy = *N->use_begin(); 1925 if (Copy->getOpcode() == ISD::CopyToReg) { 1926 // If the copy has a glue operand, we conservatively assume it isn't safe to 1927 // perform a tail call. 1928 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 1929 return false; 1930 TCChain = Copy->getOperand(0); 1931 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 1932 return false; 1933 1934 bool HasRet = false; 1935 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1936 UI != UE; ++UI) { 1937 if (UI->getOpcode() != X86ISD::RET_FLAG) 1938 return false; 1939 HasRet = true; 1940 } 1941 1942 if (!HasRet) 1943 return false; 1944 1945 Chain = TCChain; 1946 return true; 1947} 1948 1949MVT 1950X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, 1951 ISD::NodeType ExtendKind) const { 1952 MVT ReturnMVT; 1953 // TODO: Is this also valid on 32-bit? 1954 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1955 ReturnMVT = MVT::i8; 1956 else 1957 ReturnMVT = MVT::i32; 1958 1959 MVT MinVT = getRegisterType(ReturnMVT); 1960 return VT.bitsLT(MinVT) ? MinVT : VT; 1961} 1962 1963/// LowerCallResult - Lower the result values of a call into the 1964/// appropriate copies out of appropriate physical registers. 1965/// 1966SDValue 1967X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1968 CallingConv::ID CallConv, bool isVarArg, 1969 const SmallVectorImpl<ISD::InputArg> &Ins, 1970 SDLoc dl, SelectionDAG &DAG, 1971 SmallVectorImpl<SDValue> &InVals) const { 1972 1973 // Assign locations to each value returned by this call. 1974 SmallVector<CCValAssign, 16> RVLocs; 1975 bool Is64Bit = Subtarget->is64Bit(); 1976 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1977 getTargetMachine(), RVLocs, *DAG.getContext()); 1978 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1979 1980 // Copy all of the result registers out of their specified physreg. 
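  // E.g. an i32 result arrives in EAX via a CopyFromReg, while an f64 result
  // arrives either in XMM0 or, for x87 returns, on the FP stack (the
  // FpPOP_RETVAL path below).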
1981   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
1982     CCValAssign &VA = RVLocs[i];
1983     EVT CopyVT = VA.getValVT();
1984
1985     // If this is x86-64, and we disabled SSE, we can't return FP values.
1986     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1987         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1988       report_fatal_error("SSE register return with SSE disabled");
1989     }
1990
1991     SDValue Val;
1992
1993     // If this is a call to a function that returns an fp value on the floating
1994     // point stack, we must guarantee the value is popped from the stack, so
1995     // a CopyFromReg is not good enough - the copy instruction may be eliminated
1996     // if the return value is not used. We use the FpPOP_RETVAL instruction
1997     // instead.
1998     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1999       // If we prefer to use the value in xmm registers, copy it out as f80 and
2000       // use a truncate to move it from fp stack reg to xmm reg.
2001       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
2002       SDValue Ops[] = { Chain, InFlag };
2003       Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
2004                                          MVT::Other, MVT::Glue, Ops), 1);
2005       Val = Chain.getValue(0);
2006
2007       // Round the f80 to the right size, which also moves it to the
2008       // appropriate xmm register.
2009       if (CopyVT != VA.getValVT())
2010         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2011                           // This truncation won't change the value.
2012                           DAG.getIntPtrConstant(1));
2013     } else {
2014       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2015                                  CopyVT, InFlag).getValue(1);
2016       Val = Chain.getValue(0);
2017     }
2018     InFlag = Chain.getValue(2);
2019     InVals.push_back(Val);
2020   }
2021
2022   return Chain;
2023 }
2024
2025 //===----------------------------------------------------------------------===//
2026 //                C & StdCall & Fast Calling Convention implementation
2027 //===----------------------------------------------------------------------===//
2028 //  The StdCall calling convention is standard for many Windows API routines
2029 //  and elsewhere. It differs from the C calling convention in just one small
2030 //  way: the callee cleans up the stack instead of the caller. Symbols are also
2031 //  decorated in some fancy way :) It doesn't support any vector arguments.
2032 //  For info on the fast calling convention see the Fast Calling Convention
2033 //  (tail call) implementation LowerX86_32FastCCCallTo.
2034
2035 /// CallIsStructReturn - Determines whether a call uses struct return
2036 /// semantics.
2037 enum StructReturnType {
2038   NotStructReturn,
2039   RegStructReturn,
2040   StackStructReturn
2041 };
2042 static StructReturnType
2043 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2044   if (Outs.empty())
2045     return NotStructReturn;
2046
2047   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2048   if (!Flags.isSRet())
2049     return NotStructReturn;
2050   if (Flags.isInReg())
2051     return RegStructReturn;
2052   return StackStructReturn;
2053 }
2054
2055 /// ArgsAreStructReturn - Determines whether a function uses struct
2056 /// return semantics.
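/// For example (illustrative IR), a function such as
///   define void @f(%struct.S* sret %out, i32 %x)
/// has its first incoming argument flagged SRet; whether that pointer arrives
/// in a register (inreg) or on the stack picks Reg- vs. StackStructReturn.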
2057 static StructReturnType
2058 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2059   if (Ins.empty())
2060     return NotStructReturn;
2061
2062   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2063   if (!Flags.isSRet())
2064     return NotStructReturn;
2065   if (Flags.isInReg())
2066     return RegStructReturn;
2067   return StackStructReturn;
2068 }
2069
2070 /// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
2071 /// specified by "Src" to the address "Dst" with size and alignment information
2072 /// specified by the specific parameter attribute. The copy will be passed as
2073 /// a byval function parameter.
2074 static SDValue
2075 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2076                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2077                           SDLoc dl) {
2078   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2079
2080   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2081                        /*isVolatile*/false, /*AlwaysInline=*/true,
2082                        MachinePointerInfo(), MachinePointerInfo());
2083 }
2084
2085 /// IsTailCallConvention - Return true if the calling convention is one that
2086 /// supports tail call optimization.
2087 static bool IsTailCallConvention(CallingConv::ID CC) {
2088   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2089           CC == CallingConv::HiPE);
2090 }
2091
2092 /// \brief Return true if the calling convention is a C calling convention.
2093 static bool IsCCallConvention(CallingConv::ID CC) {
2094   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2095           CC == CallingConv::X86_64_SysV);
2096 }
2097
2098 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2099   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2100     return false;
2101
2102   CallSite CS(CI);
2103   CallingConv::ID CalleeCC = CS.getCallingConv();
2104   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2105     return false;
2106
2107   return true;
2108 }
2109
2110 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
2111 /// a tailcall target by changing its ABI.
2112 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2113                                    bool GuaranteedTailCallOpt) {
2114   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2115 }
2116
2117 SDValue
2118 X86TargetLowering::LowerMemArgument(SDValue Chain,
2119                                     CallingConv::ID CallConv,
2120                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2121                                     SDLoc dl, SelectionDAG &DAG,
2122                                     const CCValAssign &VA,
2123                                     MachineFrameInfo *MFI,
2124                                     unsigned i) const {
2125   // Create the nodes corresponding to a load from this parameter slot.
2126   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2127   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
2128                             getTargetMachine().Options.GuaranteedTailCallOpt);
2129   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2130   EVT ValVT;
2131
2132   // If the value is passed by pointer, the address is passed instead of the
2133   // value itself.
2134   if (VA.getLocInfo() == CCValAssign::Indirect)
2135     ValVT = VA.getLocVT();
2136   else
2137     ValVT = VA.getValVT();
2138
2139   // FIXME: For now, all byval parameter objects are marked mutable. This can
2140   // be changed with more analysis.
2141   // In the case of tail call optimization, mark all arguments mutable, since
2142   // they could be overwritten by the lowering of arguments in a tail call.
2143   if (Flags.isByVal()) {
2144     unsigned Bytes = Flags.getByValSize();
2145     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
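    // The byval object already lives at a fixed offset in the caller-created
    // argument area, so hand back a frame index for that slot rather than
    // emitting a load.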
2146 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 2147 return DAG.getFrameIndex(FI, getPointerTy()); 2148 } else { 2149 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 2150 VA.getLocMemOffset(), isImmutable); 2151 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 2152 return DAG.getLoad(ValVT, dl, Chain, FIN, 2153 MachinePointerInfo::getFixedStack(FI), 2154 false, false, false, 0); 2155 } 2156} 2157 2158SDValue 2159X86TargetLowering::LowerFormalArguments(SDValue Chain, 2160 CallingConv::ID CallConv, 2161 bool isVarArg, 2162 const SmallVectorImpl<ISD::InputArg> &Ins, 2163 SDLoc dl, 2164 SelectionDAG &DAG, 2165 SmallVectorImpl<SDValue> &InVals) 2166 const { 2167 MachineFunction &MF = DAG.getMachineFunction(); 2168 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2169 2170 const Function* Fn = MF.getFunction(); 2171 if (Fn->hasExternalLinkage() && 2172 Subtarget->isTargetCygMing() && 2173 Fn->getName() == "main") 2174 FuncInfo->setForceFramePointer(true); 2175 2176 MachineFrameInfo *MFI = MF.getFrameInfo(); 2177 bool Is64Bit = Subtarget->is64Bit(); 2178 bool IsWindows = Subtarget->isTargetWindows(); 2179 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); 2180 2181 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 2182 "Var args not supported with calling convention fastcc, ghc or hipe"); 2183 2184 // Assign locations to all of the incoming arguments. 2185 SmallVector<CCValAssign, 16> ArgLocs; 2186 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2187 ArgLocs, *DAG.getContext()); 2188 2189 // Allocate shadow area for Win64 2190 if (IsWin64) 2191 CCInfo.AllocateStack(32, 8); 2192 2193 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 2194 2195 unsigned LastVal = ~0U; 2196 SDValue ArgValue; 2197 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2198 CCValAssign &VA = ArgLocs[i]; 2199 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 2200 // places. 2201 assert(VA.getValNo() != LastVal && 2202 "Don't support value assigned to multiple locs yet"); 2203 (void)LastVal; 2204 LastVal = VA.getValNo(); 2205 2206 if (VA.isRegLoc()) { 2207 EVT RegVT = VA.getLocVT(); 2208 const TargetRegisterClass *RC; 2209 if (RegVT == MVT::i32) 2210 RC = &X86::GR32RegClass; 2211 else if (Is64Bit && RegVT == MVT::i64) 2212 RC = &X86::GR64RegClass; 2213 else if (RegVT == MVT::f32) 2214 RC = &X86::FR32RegClass; 2215 else if (RegVT == MVT::f64) 2216 RC = &X86::FR64RegClass; 2217 else if (RegVT.is512BitVector()) 2218 RC = &X86::VR512RegClass; 2219 else if (RegVT.is256BitVector()) 2220 RC = &X86::VR256RegClass; 2221 else if (RegVT.is128BitVector()) 2222 RC = &X86::VR128RegClass; 2223 else if (RegVT == MVT::x86mmx) 2224 RC = &X86::VR64RegClass; 2225 else if (RegVT == MVT::v8i1) 2226 RC = &X86::VK8RegClass; 2227 else if (RegVT == MVT::v16i1) 2228 RC = &X86::VK16RegClass; 2229 else 2230 llvm_unreachable("Unknown argument type!"); 2231 2232 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2233 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 2234 2235 // If this is an 8 or 16-bit value, it is really passed promoted to 32 2236 // bits. Insert an assert[sz]ext to capture this, then truncate to the 2237 // right size. 
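      // E.g. an i8 argument the caller sign-extended to 32 bits comes back,
      // roughly, as
      //   (truncate (AssertSext (CopyFromReg %vreg, i32), ValueType:i8))
      // once the isExtInLoc() truncate below is applied.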
2238 if (VA.getLocInfo() == CCValAssign::SExt) 2239 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 2240 DAG.getValueType(VA.getValVT())); 2241 else if (VA.getLocInfo() == CCValAssign::ZExt) 2242 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 2243 DAG.getValueType(VA.getValVT())); 2244 else if (VA.getLocInfo() == CCValAssign::BCvt) 2245 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 2246 2247 if (VA.isExtInLoc()) { 2248 // Handle MMX values passed in XMM regs. 2249 if (RegVT.isVector()) 2250 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); 2251 else 2252 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 2253 } 2254 } else { 2255 assert(VA.isMemLoc()); 2256 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 2257 } 2258 2259 // If value is passed via pointer - do a load. 2260 if (VA.getLocInfo() == CCValAssign::Indirect) 2261 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 2262 MachinePointerInfo(), false, false, false, 0); 2263 2264 InVals.push_back(ArgValue); 2265 } 2266 2267 // The x86-64 ABIs require that for returning structs by value we copy 2268 // the sret argument into %rax/%eax (depending on ABI) for the return. 2269 // Win32 requires us to put the sret argument to %eax as well. 2270 // Save the argument into a virtual register so that we can access it 2271 // from the return points. 2272 if (MF.getFunction()->hasStructRetAttr() && 2273 (Subtarget->is64Bit() || Subtarget->isTargetWindows())) { 2274 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2275 unsigned Reg = FuncInfo->getSRetReturnReg(); 2276 if (!Reg) { 2277 MVT PtrTy = getPointerTy(); 2278 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 2279 FuncInfo->setSRetReturnReg(Reg); 2280 } 2281 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 2282 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 2283 } 2284 2285 unsigned StackSize = CCInfo.getNextStackOffset(); 2286 // Align stack specially for tail calls. 2287 if (FuncIsMadeTailCallSafe(CallConv, 2288 MF.getTarget().Options.GuaranteedTailCallOpt)) 2289 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 2290 2291 // If the function takes variable number of arguments, make a frame index for 2292 // the start of the first vararg value... for expansion of llvm.va_start. 2293 if (isVarArg) { 2294 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 2295 CallConv != CallingConv::X86_ThisCall)) { 2296 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 2297 } 2298 if (Is64Bit) { 2299 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 2300 2301 // FIXME: We should really autogenerate these arrays 2302 static const uint16_t GPR64ArgRegsWin64[] = { 2303 X86::RCX, X86::RDX, X86::R8, X86::R9 2304 }; 2305 static const uint16_t GPR64ArgRegs64Bit[] = { 2306 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 2307 }; 2308 static const uint16_t XMMArgRegs64Bit[] = { 2309 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2310 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2311 }; 2312 const uint16_t *GPR64ArgRegs; 2313 unsigned NumXMMRegs = 0; 2314 2315 if (IsWin64) { 2316 // The XMM registers which might contain var arg parameters are shadowed 2317 // in their paired GPR. So we only need to save the GPR to their home 2318 // slots. 
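        // (Illustrative: a double passed as the fourth variadic argument on
        // Win64 is available both in XMM3 and in R9, so spilling R9 to its
        // home slot is enough to make va_arg work.)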
2319       TotalNumIntRegs = 4;
2320       GPR64ArgRegs = GPR64ArgRegsWin64;
2321     } else {
2322       TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
2323       GPR64ArgRegs = GPR64ArgRegs64Bit;
2324
2325       NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
2326                                               TotalNumXMMRegs);
2327     }
2328     unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
2329                                                      TotalNumIntRegs);
2330
2331     bool NoImplicitFloatOps = Fn->getAttributes().
2332       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2333     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2334            "SSE register cannot be used when SSE is disabled!");
2335     assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
2336              NoImplicitFloatOps) &&
2337            "SSE register cannot be used when SSE is disabled!");
2338     if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2339         !Subtarget->hasSSE1())
2340       // Kernel mode asks for SSE to be disabled, so don't push them
2341       // on the stack.
2342       TotalNumXMMRegs = 0;
2343
2344     if (IsWin64) {
2345       const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
2346       // Get to the caller-allocated home save location. Add 8 to account
2347       // for the return address.
2348       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2349       FuncInfo->setRegSaveFrameIndex(
2350           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2351       // Fixup to set vararg frame on shadow area (4 x i64).
2352       if (NumIntRegs < 4)
2353         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2354     } else {
2355       // For X86-64, if there are vararg parameters that are passed via
2356       // registers, then we must store them to their spots on the stack so
2357       // they may be loaded by dereferencing the result of va_next.
2358       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2359       FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
2360       FuncInfo->setRegSaveFrameIndex(
2361           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16,
2362                                  16, false));
2363     }
2364
2365     // Store the integer parameter registers.
2366     SmallVector<SDValue, 8> MemOps;
2367     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2368                                       getPointerTy());
2369     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2370     for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
2371       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2372                                 DAG.getIntPtrConstant(Offset));
2373       unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
2374                                    &X86::GR64RegClass);
2375       SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
2376       SDValue Store =
2377         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2378                      MachinePointerInfo::getFixedStack(
2379                        FuncInfo->getRegSaveFrameIndex(), Offset),
2380                      false, false, 0);
2381       MemOps.push_back(Store);
2382       Offset += 8;
2383     }
2384
2385     if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
2386       // Now store the XMM (fp + vector) parameter registers.
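      // These stores are emitted as a single VASTART_SAVE_XMM_REGS pseudo
      // that tests AL at run time, so the XMM spills only execute when the
      // caller actually passed SSE arguments (AL carries that upper bound).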
2387 SmallVector<SDValue, 11> SaveXMMOps; 2388 SaveXMMOps.push_back(Chain); 2389 2390 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 2391 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 2392 SaveXMMOps.push_back(ALVal); 2393 2394 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2395 FuncInfo->getRegSaveFrameIndex())); 2396 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2397 FuncInfo->getVarArgsFPOffset())); 2398 2399 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 2400 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 2401 &X86::VR128RegClass); 2402 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 2403 SaveXMMOps.push_back(Val); 2404 } 2405 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 2406 MVT::Other, 2407 &SaveXMMOps[0], SaveXMMOps.size())); 2408 } 2409 2410 if (!MemOps.empty()) 2411 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2412 &MemOps[0], MemOps.size()); 2413 } 2414 } 2415 2416 // Some CCs need callee pop. 2417 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2418 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2419 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 2420 } else { 2421 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 2422 // If this is an sret function, the return should pop the hidden pointer. 2423 if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2424 argsAreStructReturn(Ins) == StackStructReturn) 2425 FuncInfo->setBytesToPopOnReturn(4); 2426 } 2427 2428 if (!Is64Bit) { 2429 // RegSaveFrameIndex is X86-64 only. 2430 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 2431 if (CallConv == CallingConv::X86_FastCall || 2432 CallConv == CallingConv::X86_ThisCall) 2433 // fastcc functions can't have varargs. 2434 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 2435 } 2436 2437 FuncInfo->setArgumentStackSize(StackSize); 2438 2439 return Chain; 2440} 2441 2442SDValue 2443X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 2444 SDValue StackPtr, SDValue Arg, 2445 SDLoc dl, SelectionDAG &DAG, 2446 const CCValAssign &VA, 2447 ISD::ArgFlagsTy Flags) const { 2448 unsigned LocMemOffset = VA.getLocMemOffset(); 2449 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 2450 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 2451 if (Flags.isByVal()) 2452 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 2453 2454 return DAG.getStore(Chain, dl, Arg, PtrOff, 2455 MachinePointerInfo::getStack(LocMemOffset), 2456 false, false, 0); 2457} 2458 2459/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 2460/// optimization is performed and it is required. 2461SDValue 2462X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 2463 SDValue &OutRetAddr, SDValue Chain, 2464 bool IsTailCall, bool Is64Bit, 2465 int FPDiff, SDLoc dl) const { 2466 // Adjust the Return address stack slot. 2467 EVT VT = getPointerTy(); 2468 OutRetAddr = getReturnAddressFrameIndex(DAG); 2469 2470 // Load the "old" Return address. 2471 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 2472 false, false, false, 0); 2473 return SDValue(OutRetAddr.getNode(), 1); 2474} 2475 2476/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 2477/// optimization is performed and it is required (FPDiff!=0). 
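/// The slot is shifted by FPDiff bytes, keeping the return address
/// immediately above the (re-sized) outgoing argument area.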
2478 static SDValue
2479 EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2480                          SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
2481                          unsigned SlotSize, int FPDiff, SDLoc dl) {
2482   // Store the return address to the appropriate stack slot.
2483   if (!FPDiff) return Chain;
2484   // Calculate the new stack slot for the return address.
2485   int NewReturnAddrFI =
2486     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2487                                          false);
2488   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2489   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2490                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2491                        false, false, 0);
2492   return Chain;
2493 }
2494
2495 SDValue
2496 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2497                              SmallVectorImpl<SDValue> &InVals) const {
2498   SelectionDAG &DAG = CLI.DAG;
2499   SDLoc &dl = CLI.DL;
2500   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2501   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2502   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2503   SDValue Chain = CLI.Chain;
2504   SDValue Callee = CLI.Callee;
2505   CallingConv::ID CallConv = CLI.CallConv;
2506   bool &isTailCall = CLI.IsTailCall;
2507   bool isVarArg = CLI.IsVarArg;
2508
2509   MachineFunction &MF = DAG.getMachineFunction();
2510   bool Is64Bit = Subtarget->is64Bit();
2511   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2512   bool IsWindows = Subtarget->isTargetWindows();
2513   StructReturnType SR = callIsStructReturn(Outs);
2514   bool IsSibcall = false;
2515
2516   if (MF.getTarget().Options.DisableTailCalls)
2517     isTailCall = false;
2518
2519   if (isTailCall) {
2520     // Check if it's really possible to do a tail call.
2521     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2522                     isVarArg, SR != NotStructReturn,
2523                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2524                     Outs, OutVals, Ins, DAG);
2525
2526     // Sibcalls are automatically detected tailcalls which do not require
2527     // ABI changes.
2528     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2529       IsSibcall = true;
2530
2531     if (isTailCall)
2532       ++NumTailCalls;
2533   }
2534
2535   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2536          "Var args not supported with calling convention fastcc, ghc or hipe");
2537
2538   // Analyze operands of the call, assigning locations to each operand.
2539   SmallVector<CCValAssign, 16> ArgLocs;
2540   CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
2541                  ArgLocs, *DAG.getContext());
2542
2543   // Allocate shadow area for Win64
2544   if (IsWin64)
2545     CCInfo.AllocateStack(32, 8);
2546
2547   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2548
2549   // Get a count of how many bytes are to be pushed on the stack.
2550   unsigned NumBytes = CCInfo.getNextStackOffset();
2551   if (IsSibcall)
2552     // This is a sibcall. The memory operands are available in the caller's
2553     // own stack.
2554     NumBytes = 0;
2555   else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
2556            IsTailCallConvention(CallConv))
2557     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2558
2559   int FPDiff = 0;
2560   if (isTailCall && !IsSibcall) {
2561     // Lower arguments at fp - stackoffset + fpdiff.
2562     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2563     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2564
2565     FPDiff = NumBytesCallerPushed - NumBytes;
2566
2567     // Record the delta by which the return-address stack slot moves, but only
2568     // if this call moves it further than any previously recorded delta.
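    // Worked example: if the caller pushed 8 bytes of arguments but the callee
    // needs 24, FPDiff = 8 - 24 = -16, i.e. the return address must move
    // 16 bytes further down the stack.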
2569     if (FPDiff < X86Info->getTCReturnAddrDelta())
2570       X86Info->setTCReturnAddrDelta(FPDiff);
2571   }
2572
2573   if (!IsSibcall)
2574     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
2575                                  dl);
2576
2577   SDValue RetAddrFrIdx;
2578   // Load return address for tail calls.
2579   if (isTailCall && FPDiff)
2580     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2581                                     Is64Bit, FPDiff, dl);
2582
2583   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2584   SmallVector<SDValue, 8> MemOpChains;
2585   SDValue StackPtr;
2586
2587   // Walk the register/memloc assignments, inserting copies/loads. In the case
2588   // of tail call optimization, arguments are handled later.
2589   const X86RegisterInfo *RegInfo =
2590     static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
2591   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2592     CCValAssign &VA = ArgLocs[i];
2593     EVT RegVT = VA.getLocVT();
2594     SDValue Arg = OutVals[i];
2595     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2596     bool isByVal = Flags.isByVal();
2597
2598     // Promote the value if needed.
2599     switch (VA.getLocInfo()) {
2600     default: llvm_unreachable("Unknown loc info!");
2601     case CCValAssign::Full: break;
2602     case CCValAssign::SExt:
2603       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2604       break;
2605     case CCValAssign::ZExt:
2606       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2607       break;
2608     case CCValAssign::AExt:
2609       if (RegVT.is128BitVector()) {
2610         // Special case: passing MMX values in XMM registers.
2611         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2612         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2613         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2614       } else
2615         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2616       break;
2617     case CCValAssign::BCvt:
2618       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2619       break;
2620     case CCValAssign::Indirect: {
2621       // Store the argument.
2622       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2623       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2624       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2625                            MachinePointerInfo::getFixedStack(FI),
2626                            false, false, 0);
2627       Arg = SpillSlot;
2628       break;
2629     }
2630     }
2631
2632     if (VA.isRegLoc()) {
2633       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2634       if (isVarArg && IsWin64) {
2635         // The Win64 ABI requires an argument XMM reg to be copied to the
2636         // corresponding shadow reg if the callee is a varargs function.
2637         unsigned ShadowReg = 0;
2638         switch (VA.getLocReg()) {
2639         case X86::XMM0: ShadowReg = X86::RCX; break;
2640         case X86::XMM1: ShadowReg = X86::RDX; break;
2641         case X86::XMM2: ShadowReg = X86::R8; break;
2642         case X86::XMM3: ShadowReg = X86::R9; break;
2643         }
2644         if (ShadowReg)
2645           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2646       }
2647     } else if (!IsSibcall && (!isTailCall || isByVal)) {
2648       assert(VA.isMemLoc());
2649       if (StackPtr.getNode() == 0)
2650         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2651                                       getPointerTy());
2652       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2653                                              dl, DAG, VA, Flags));
2654     }
2655   }
2656
2657   if (!MemOpChains.empty())
2658     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2659                         &MemOpChains[0], MemOpChains.size());
2660
2661   if (Subtarget->isPICStyleGOT()) {
2662     // ELF / PIC requires the GOT pointer to be live in EBX before function
2663     // calls made via the PLT.
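    // (Illustrative: on i386 ELF a PIC call is roughly
    //    calll foo@PLT        # the PLT stub expects %ebx = GOT base
    //  which is why EBX must hold the GOT address across the call.)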
2664     if (!isTailCall) {
2665       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2666                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2667     } else {
2668       // If we are tail calling and generating PIC/GOT style code load the
2669       // address of the callee into ECX. The value in ecx is used as target of
2670       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2671       // for tail calls on PIC/GOT architectures. Normally we would just put the
2672       // address of GOT into ebx and then call target@PLT. But for tail calls
2673       // ebx would be restored (since ebx is callee saved) before jumping to the
2674       // target@PLT.
2675
2676       // Note: The actual moving to ECX is done further down.
2677       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2678       if (G && !G->getGlobal()->hasHiddenVisibility() &&
2679           !G->getGlobal()->hasProtectedVisibility())
2680         Callee = LowerGlobalAddress(Callee, DAG);
2681       else if (isa<ExternalSymbolSDNode>(Callee))
2682         Callee = LowerExternalSymbol(Callee, DAG);
2683     }
2684   }
2685
2686   if (Is64Bit && isVarArg && !IsWin64) {
2687     // From the AMD64 ABI document:
2688     //   For calls that may call functions that use varargs or stdargs
2689     //   (prototype-less calls or calls to functions containing ellipsis (...)
2690     //   in the declaration) %al is used as a hidden argument to specify the
2691     //   number of SSE registers used. The contents of %al do not need to
2692     //   match exactly the number of registers, but must be an upper bound on
2693     //   the number of SSE registers used and is in the range 0 - 8 inclusive.
2694
2695     // Count the number of XMM registers allocated.
2696     static const uint16_t XMMArgRegs[] = {
2697       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2698       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2699     };
2700     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2701     assert((Subtarget->hasSSE1() || !NumXMMRegs)
2702            && "SSE registers cannot be used when SSE is disabled");
2703
2704     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
2705                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
2706   }
2707
2708   // For tail calls lower the arguments to the 'real' stack slot.
2709   if (isTailCall) {
2710     // Force all the incoming stack arguments to be loaded from the stack
2711     // before any new outgoing arguments are stored to the stack, because the
2712     // outgoing stack slots may alias the incoming argument stack slots, and
2713     // the alias isn't otherwise explicit. This is slightly more conservative
2714     // than necessary, because it means that each store effectively depends
2715     // on every argument instead of just those arguments it would clobber.
2716     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2717
2718     SmallVector<SDValue, 8> MemOpChains2;
2719     SDValue FIN;
2720     int FI = 0;
2721     if (getTargetMachine().Options.GuaranteedTailCallOpt) {
2722       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2723         CCValAssign &VA = ArgLocs[i];
2724         if (VA.isRegLoc())
2725           continue;
2726         assert(VA.isMemLoc());
2727         SDValue Arg = OutVals[i];
2728         ISD::ArgFlagsTy Flags = Outs[i].Flags;
2729         // Create frame index.
2730         int32_t Offset = VA.getLocMemOffset()+FPDiff;
2731         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2732         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2733         FIN = DAG.getFrameIndex(FI, getPointerTy());
2734
2735         if (Flags.isByVal()) {
2736           // Copy relative to framepointer.
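          // The source address is the argument's slot in our own outgoing
          // area (stack pointer plus the original memory offset, formed
          // below); it is copied into the tail-call slot FIN.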
2737           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2738           if (StackPtr.getNode() == 0)
2739             StackPtr = DAG.getCopyFromReg(Chain, dl,
2740                                           RegInfo->getStackRegister(),
2741                                           getPointerTy());
2742           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2743
2744           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2745                                                            ArgChain,
2746                                                            Flags, DAG, dl));
2747         } else {
2748           // Store relative to framepointer.
2749           MemOpChains2.push_back(
2750             DAG.getStore(ArgChain, dl, Arg, FIN,
2751                          MachinePointerInfo::getFixedStack(FI),
2752                          false, false, 0));
2753         }
2754       }
2755     }
2756
2757     if (!MemOpChains2.empty())
2758       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2759                           &MemOpChains2[0], MemOpChains2.size());
2760
2761     // Store the return address to the appropriate stack slot.
2762     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2763                                      getPointerTy(), RegInfo->getSlotSize(),
2764                                      FPDiff, dl);
2765   }
2766
2767   // Build a sequence of copy-to-reg nodes chained together with token chain
2768   // and flag operands which copy the outgoing args into registers.
2769   SDValue InFlag;
2770   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2771     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2772                              RegsToPass[i].second, InFlag);
2773     InFlag = Chain.getValue(1);
2774   }
2775
2776   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2777     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2778     // In the 64-bit large code model, we have to make all calls
2779     // through a register, since the call instruction's 32-bit
2780     // pc-relative offset may not be large enough to hold the whole
2781     // address.
2782   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2783     // If the callee is a GlobalAddress node (quite common, every direct call
2784     // is) turn it into a TargetGlobalAddress node so that legalize doesn't
2785     // hack it.
2786
2787     // We should use an extra load for direct calls to dllimported functions
2788     // in non-JIT mode.
2789     const GlobalValue *GV = G->getGlobal();
2790     if (!GV->hasDLLImportLinkage()) {
2791       unsigned char OpFlags = 0;
2792       bool ExtraLoad = false;
2793       unsigned WrapperKind = ISD::DELETED_NODE;
2794
2795       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2796       // external symbols must go through the PLT in PIC mode. If the symbol
2797       // has hidden or protected visibility, or if it is static or local, then
2798       // we don't need to use the PLT - we can directly call it.
2799       if (Subtarget->isTargetELF() &&
2800           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2801           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2802         OpFlags = X86II::MO_PLT;
2803       } else if (Subtarget->isPICStyleStubAny() &&
2804                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
2805                  (!Subtarget->getTargetTriple().isMacOSX() ||
2806                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2807         // PC-relative references to external symbols should go through $stub,
2808         // unless we're building with the Leopard linker or later, which
2809         // automatically synthesizes these stubs.
2810         OpFlags = X86II::MO_DARWIN_STUB;
2811       } else if (Subtarget->isPICStyleRIPRel() &&
2812                  isa<Function>(GV) &&
2813                  cast<Function>(GV)->getAttributes().
2814                    hasAttribute(AttributeSet::FunctionIndex,
2815                                 Attribute::NonLazyBind)) {
2816         // If the function is marked as non-lazy, generate an indirect call
2817         // which loads from the GOT directly. This avoids runtime overhead
This avoids runtime overhead 2818 // at the cost of eager binding (and one extra byte of encoding). 2819 OpFlags = X86II::MO_GOTPCREL; 2820 WrapperKind = X86ISD::WrapperRIP; 2821 ExtraLoad = true; 2822 } 2823 2824 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2825 G->getOffset(), OpFlags); 2826 2827 // Add a wrapper if needed. 2828 if (WrapperKind != ISD::DELETED_NODE) 2829 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2830 // Add extra indirection if needed. 2831 if (ExtraLoad) 2832 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2833 MachinePointerInfo::getGOT(), 2834 false, false, false, 0); 2835 } 2836 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2837 unsigned char OpFlags = 0; 2838 2839 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2840 // external symbols should go through the PLT. 2841 if (Subtarget->isTargetELF() && 2842 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2843 OpFlags = X86II::MO_PLT; 2844 } else if (Subtarget->isPICStyleStubAny() && 2845 (!Subtarget->getTargetTriple().isMacOSX() || 2846 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2847 // PC-relative references to external symbols should go through $stub, 2848 // unless we're building with the leopard linker or later, which 2849 // automatically synthesizes these stubs. 2850 OpFlags = X86II::MO_DARWIN_STUB; 2851 } 2852 2853 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2854 OpFlags); 2855 } 2856 2857 // Returns a chain & a flag for retval copy to use. 2858 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2859 SmallVector<SDValue, 8> Ops; 2860 2861 if (!IsSibcall && isTailCall) { 2862 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2863 DAG.getIntPtrConstant(0, true), InFlag, dl); 2864 InFlag = Chain.getValue(1); 2865 } 2866 2867 Ops.push_back(Chain); 2868 Ops.push_back(Callee); 2869 2870 if (isTailCall) 2871 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2872 2873 // Add argument registers to the end of the list so that they are known live 2874 // into the call. 2875 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2876 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2877 RegsToPass[i].second.getValueType())); 2878 2879 // Add a register mask operand representing the call-preserved registers. 2880 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 2881 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); 2882 assert(Mask && "Missing call preserved mask for calling convention"); 2883 Ops.push_back(DAG.getRegisterMask(Mask)); 2884 2885 if (InFlag.getNode()) 2886 Ops.push_back(InFlag); 2887 2888 if (isTailCall) { 2889 // We used to do: 2890 //// If this is the first return lowered for this function, add the regs 2891 //// to the liveout set for the function. 2892 // This isn't right, although it's probably harmless on x86; liveouts 2893 // should be computed from returns not tail calls. Consider a void 2894 // function making a tail call to a function returning int. 2895 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); 2896 } 2897 2898 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2899 InFlag = Chain.getValue(1); 2900 2901 // Create the CALLSEQ_END node. 
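  // (Illustrative note, not from the original comments: e.g. a 32-bit stdcall
  // callee taking 12 bytes of arguments pops all 12 on return, while a C
  // calling convention sret call on Linux pops only the 4-byte hidden
  // struct-return pointer.)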
  unsigned NumBytesForCalleeToPush;
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       getTargetMachine().Options.GuaranteedTailCallOpt))
    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
           SR == StackStructReturn)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
    NumBytesForCalleeToPush = 4;
  else
    NumBytesForCalleeToPush = 0;  // Callee pops nothing.

  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                     true),
                               InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}

//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like the StdCall calling convention, the callee cleans up the arguments,
//  except that ECX is reserved for storing the tail-called function address.
//  Only 2 registers are free for argument passing (inreg). Tail call
//  optimization is performed provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - Darwin's
//  dyld, for example.)
//  If a tail-called function callee has more arguments than the caller, the
//  caller needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after the
//  original RETADDR, but before the saved frame pointer or the spilled registers
//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
//  stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..

/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 bytes
/// for a 16-byte alignment requirement, so that the stack stays aligned once
/// the return address slot is accounted for.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                               SelectionDAG& DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
  const TargetFrameLowering &TFI = *TM.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  unsigned SlotSize = RegInfo->getSlotSize();
  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    // Number smaller than 12 so just add the difference.
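    // (Illustrative worked example, not from the original comments: with
    // StackAlignment = 16 and SlotSize = 4, StackSize = 20 has low bits
    // 20 & 15 == 4 <= 12, so we add 12 - 4 and return 28 == 16*1 + 12.)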
2979 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2980 } else { 2981 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2982 Offset = ((~AlignMask) & Offset) + StackAlignment + 2983 (StackAlignment-SlotSize); 2984 } 2985 return Offset; 2986} 2987 2988/// MatchingStackOffset - Return true if the given stack call argument is 2989/// already available in the same position (relatively) of the caller's 2990/// incoming argument stack. 2991static 2992bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2993 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2994 const X86InstrInfo *TII) { 2995 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2996 int FI = INT_MAX; 2997 if (Arg.getOpcode() == ISD::CopyFromReg) { 2998 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2999 if (!TargetRegisterInfo::isVirtualRegister(VR)) 3000 return false; 3001 MachineInstr *Def = MRI->getVRegDef(VR); 3002 if (!Def) 3003 return false; 3004 if (!Flags.isByVal()) { 3005 if (!TII->isLoadFromStackSlot(Def, FI)) 3006 return false; 3007 } else { 3008 unsigned Opcode = Def->getOpcode(); 3009 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 3010 Def->getOperand(1).isFI()) { 3011 FI = Def->getOperand(1).getIndex(); 3012 Bytes = Flags.getByValSize(); 3013 } else 3014 return false; 3015 } 3016 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 3017 if (Flags.isByVal()) 3018 // ByVal argument is passed in as a pointer but it's now being 3019 // dereferenced. e.g. 3020 // define @foo(%struct.X* %A) { 3021 // tail call @bar(%struct.X* byval %A) 3022 // } 3023 return false; 3024 SDValue Ptr = Ld->getBasePtr(); 3025 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 3026 if (!FINode) 3027 return false; 3028 FI = FINode->getIndex(); 3029 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 3030 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 3031 FI = FINode->getIndex(); 3032 Bytes = Flags.getByValSize(); 3033 } else 3034 return false; 3035 3036 assert(FI != INT_MAX); 3037 if (!MFI->isFixedObjectIndex(FI)) 3038 return false; 3039 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 3040} 3041 3042/// IsEligibleForTailCallOptimization - Check whether the call is eligible 3043/// for tail call optimization. Targets which want to do tail call 3044/// optimization should implement this function. 3045bool 3046X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 3047 CallingConv::ID CalleeCC, 3048 bool isVarArg, 3049 bool isCalleeStructRet, 3050 bool isCallerStructRet, 3051 Type *RetTy, 3052 const SmallVectorImpl<ISD::OutputArg> &Outs, 3053 const SmallVectorImpl<SDValue> &OutVals, 3054 const SmallVectorImpl<ISD::InputArg> &Ins, 3055 SelectionDAG &DAG) const { 3056 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) 3057 return false; 3058 3059 // If -tailcallopt is specified, make fastcc functions tail-callable. 3060 const MachineFunction &MF = DAG.getMachineFunction(); 3061 const Function *CallerF = MF.getFunction(); 3062 3063 // If the function return type is x86_fp80 and the callee return type is not, 3064 // then the FP_EXTEND of the call result is not a nop. It's not safe to 3065 // perform a tailcall optimization here. 
3066 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) 3067 return false; 3068 3069 CallingConv::ID CallerCC = CallerF->getCallingConv(); 3070 bool CCMatch = CallerCC == CalleeCC; 3071 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); 3072 bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); 3073 3074 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 3075 if (IsTailCallConvention(CalleeCC) && CCMatch) 3076 return true; 3077 return false; 3078 } 3079 3080 // Look for obvious safe cases to perform tail call optimization that do not 3081 // require ABI changes. This is what gcc calls sibcall. 3082 3083 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 3084 // emit a special epilogue. 3085 const X86RegisterInfo *RegInfo = 3086 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 3087 if (RegInfo->needsStackRealignment(MF)) 3088 return false; 3089 3090 // Also avoid sibcall optimization if either caller or callee uses struct 3091 // return semantics. 3092 if (isCalleeStructRet || isCallerStructRet) 3093 return false; 3094 3095 // An stdcall caller is expected to clean up its arguments; the callee 3096 // isn't going to do that. 3097 if (!CCMatch && CallerCC == CallingConv::X86_StdCall) 3098 return false; 3099 3100 // Do not sibcall optimize vararg calls unless all arguments are passed via 3101 // registers. 3102 if (isVarArg && !Outs.empty()) { 3103 3104 // Optimizing for varargs on Win64 is unlikely to be safe without 3105 // additional testing. 3106 if (IsCalleeWin64 || IsCallerWin64) 3107 return false; 3108 3109 SmallVector<CCValAssign, 16> ArgLocs; 3110 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 3111 getTargetMachine(), ArgLocs, *DAG.getContext()); 3112 3113 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3114 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 3115 if (!ArgLocs[i].isRegLoc()) 3116 return false; 3117 } 3118 3119 // If the call result is in ST0 / ST1, it needs to be popped off the x87 3120 // stack. Therefore, if it's not used by the call it is not safe to optimize 3121 // this into a sibcall. 3122 bool Unused = false; 3123 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 3124 if (!Ins[i].Used) { 3125 Unused = true; 3126 break; 3127 } 3128 } 3129 if (Unused) { 3130 SmallVector<CCValAssign, 16> RVLocs; 3131 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 3132 getTargetMachine(), RVLocs, *DAG.getContext()); 3133 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 3134 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 3135 CCValAssign &VA = RVLocs[i]; 3136 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 3137 return false; 3138 } 3139 } 3140 3141 // If the calling conventions do not match, then we'd better make sure the 3142 // results are returned in the same way as what the caller expects. 
3143 if (!CCMatch) { 3144 SmallVector<CCValAssign, 16> RVLocs1; 3145 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 3146 getTargetMachine(), RVLocs1, *DAG.getContext()); 3147 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 3148 3149 SmallVector<CCValAssign, 16> RVLocs2; 3150 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 3151 getTargetMachine(), RVLocs2, *DAG.getContext()); 3152 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 3153 3154 if (RVLocs1.size() != RVLocs2.size()) 3155 return false; 3156 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 3157 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 3158 return false; 3159 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 3160 return false; 3161 if (RVLocs1[i].isRegLoc()) { 3162 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 3163 return false; 3164 } else { 3165 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 3166 return false; 3167 } 3168 } 3169 } 3170 3171 // If the callee takes no arguments then go on to check the results of the 3172 // call. 3173 if (!Outs.empty()) { 3174 // Check if stack adjustment is needed. For now, do not do this if any 3175 // argument is passed on the stack. 3176 SmallVector<CCValAssign, 16> ArgLocs; 3177 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 3178 getTargetMachine(), ArgLocs, *DAG.getContext()); 3179 3180 // Allocate shadow area for Win64 3181 if (IsCalleeWin64) 3182 CCInfo.AllocateStack(32, 8); 3183 3184 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 3185 if (CCInfo.getNextStackOffset()) { 3186 MachineFunction &MF = DAG.getMachineFunction(); 3187 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 3188 return false; 3189 3190 // Check if the arguments are already laid out in the right way as 3191 // the caller's fixed stack objects. 3192 MachineFrameInfo *MFI = MF.getFrameInfo(); 3193 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 3194 const X86InstrInfo *TII = 3195 ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); 3196 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3197 CCValAssign &VA = ArgLocs[i]; 3198 SDValue Arg = OutVals[i]; 3199 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3200 if (VA.getLocInfo() == CCValAssign::Indirect) 3201 return false; 3202 if (!VA.isRegLoc()) { 3203 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 3204 MFI, MRI, TII)) 3205 return false; 3206 } 3207 } 3208 } 3209 3210 // If the tailcall address may be in a register, then make sure it's 3211 // possible to register allocate for it. In 32-bit, the call address can 3212 // only target EAX, EDX, or ECX since the tail call must be scheduled after 3213 // callee-saved registers are restored. These happen to be the same 3214 // registers used to pass 'inreg' arguments so watch out for those. 3215 if (!Subtarget->is64Bit() && 3216 ((!isa<GlobalAddressSDNode>(Callee) && 3217 !isa<ExternalSymbolSDNode>(Callee)) || 3218 getTargetMachine().getRelocationModel() == Reloc::PIC_)) { 3219 unsigned NumInRegs = 0; 3220 // In PIC we need an extra register to formulate the address computation 3221 // for the callee. 3222 unsigned MaxInRegs = 3223 (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 
2 : 3; 3224 3225 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3226 CCValAssign &VA = ArgLocs[i]; 3227 if (!VA.isRegLoc()) 3228 continue; 3229 unsigned Reg = VA.getLocReg(); 3230 switch (Reg) { 3231 default: break; 3232 case X86::EAX: case X86::EDX: case X86::ECX: 3233 if (++NumInRegs == MaxInRegs) 3234 return false; 3235 break; 3236 } 3237 } 3238 } 3239 } 3240 3241 return true; 3242} 3243 3244FastISel * 3245X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 3246 const TargetLibraryInfo *libInfo) const { 3247 return X86::createFastISel(funcInfo, libInfo); 3248} 3249 3250//===----------------------------------------------------------------------===// 3251// Other Lowering Hooks 3252//===----------------------------------------------------------------------===// 3253 3254static bool MayFoldLoad(SDValue Op) { 3255 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 3256} 3257 3258static bool MayFoldIntoStore(SDValue Op) { 3259 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 3260} 3261 3262static bool isTargetShuffle(unsigned Opcode) { 3263 switch(Opcode) { 3264 default: return false; 3265 case X86ISD::PSHUFD: 3266 case X86ISD::PSHUFHW: 3267 case X86ISD::PSHUFLW: 3268 case X86ISD::SHUFP: 3269 case X86ISD::PALIGNR: 3270 case X86ISD::MOVLHPS: 3271 case X86ISD::MOVLHPD: 3272 case X86ISD::MOVHLPS: 3273 case X86ISD::MOVLPS: 3274 case X86ISD::MOVLPD: 3275 case X86ISD::MOVSHDUP: 3276 case X86ISD::MOVSLDUP: 3277 case X86ISD::MOVDDUP: 3278 case X86ISD::MOVSS: 3279 case X86ISD::MOVSD: 3280 case X86ISD::UNPCKL: 3281 case X86ISD::UNPCKH: 3282 case X86ISD::VPERMILP: 3283 case X86ISD::VPERM2X128: 3284 case X86ISD::VPERMI: 3285 return true; 3286 } 3287} 3288 3289static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3290 SDValue V1, SelectionDAG &DAG) { 3291 switch(Opc) { 3292 default: llvm_unreachable("Unknown x86 shuffle node"); 3293 case X86ISD::MOVSHDUP: 3294 case X86ISD::MOVSLDUP: 3295 case X86ISD::MOVDDUP: 3296 return DAG.getNode(Opc, dl, VT, V1); 3297 } 3298} 3299 3300static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3301 SDValue V1, unsigned TargetMask, 3302 SelectionDAG &DAG) { 3303 switch(Opc) { 3304 default: llvm_unreachable("Unknown x86 shuffle node"); 3305 case X86ISD::PSHUFD: 3306 case X86ISD::PSHUFHW: 3307 case X86ISD::PSHUFLW: 3308 case X86ISD::VPERMILP: 3309 case X86ISD::VPERMI: 3310 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 3311 } 3312} 3313 3314static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3315 SDValue V1, SDValue V2, unsigned TargetMask, 3316 SelectionDAG &DAG) { 3317 switch(Opc) { 3318 default: llvm_unreachable("Unknown x86 shuffle node"); 3319 case X86ISD::PALIGNR: 3320 case X86ISD::SHUFP: 3321 case X86ISD::VPERM2X128: 3322 return DAG.getNode(Opc, dl, VT, V1, V2, 3323 DAG.getConstant(TargetMask, MVT::i8)); 3324 } 3325} 3326 3327static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 3328 SDValue V1, SDValue V2, SelectionDAG &DAG) { 3329 switch(Opc) { 3330 default: llvm_unreachable("Unknown x86 shuffle node"); 3331 case X86ISD::MOVLHPS: 3332 case X86ISD::MOVLHPD: 3333 case X86ISD::MOVHLPS: 3334 case X86ISD::MOVLPS: 3335 case X86ISD::MOVLPD: 3336 case X86ISD::MOVSS: 3337 case X86ISD::MOVSD: 3338 case X86ISD::UNPCKL: 3339 case X86ISD::UNPCKH: 3340 return DAG.getNode(Opc, dl, VT, V1, V2); 3341 } 3342} 3343 3344SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 3345 MachineFunction &MF = 
DAG.getMachineFunction();
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    unsigned SlotSize = RegInfo->getSlotSize();
    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
                                                           -(int64_t)SlotSize,
                                                           false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}

bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into a 32-bit immediate field.
  if (!isInt<32>(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For small code model we assume that the latest object is 16MB before the
  // end of the 31-bit boundary. We may also accept pretty large negative
  // constants knowing that all objects are in the positive half of the
  // address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;

  // For kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space. We must not accept negative offsets,
  // since they may push the address out of range, but we may accept pretty
  // large positive ones.
  if (M == CodeModel::Kernel && Offset > 0)
    return true;

  return false;
}

/// isCalleePop - Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
  if (IsVarArg)
    return false;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
    return !is64Bit;
  case CallingConv::X86_FastCall:
    return !is64Bit;
  case CallingConv::X86_ThisCall:
    return !is64Bit;
  case CallingConv::Fast:
    return TailCallOpt;
  case CallingConv::GHC:
    return TailCallOpt;
  case CallingConv::HiPE:
    return TailCallOpt;
  }
}

/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
/// X86-specific condition code, returning the condition code and the LHS/RHS
/// of the comparison to make.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}

/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
/// code? The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified half-open range [Low, Hi).
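/// For example (illustrative, not from the original comments): with Low = 4
/// and Hi = 8, the values -1 (undef), 4, 5, 6 and 7 are accepted, while 3 and
/// 8 are rejected.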
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return (Val < 0 || Val == CmpVal);
}

/// isSequentialOrUndefInRange - Return true if every element in Mask,
/// beginning at position Pos and ending at Pos+Size, falls within the
/// sequential range [Low, Low+Size), or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}

/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
  if (VT == MVT::v4f32 || VT == MVT::v4i32)
    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
  if (VT == MVT::v2f64 || VT == MVT::v2i64)
    return (Mask[0] < 2 && Mask[1] < 2);
  return false;
}

/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFHW.
static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
    return false;

  // Lower quadword copied in order or undef.
  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
    return false;

  // Upper quadword shuffled.
  for (unsigned i = 4; i != 8; ++i)
    if (!isUndefOrInRange(Mask[i], 4, 8))
      return false;

  if (VT == MVT::v16i16) {
    // Lower quadword copied in order or undef.
    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
      return false;

    // Upper quadword shuffled.
    for (unsigned i = 12; i != 16; ++i)
      if (!isUndefOrInRange(Mask[i], 12, 16))
        return false;
  }

  return true;
}

/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFLW.
static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
    return false;

  // Upper quadword copied in order.
  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
    return false;

  // Lower quadword shuffled.
  for (unsigned i = 0; i != 4; ++i)
    if (!isUndefOrInRange(Mask[i], 0, 4))
      return false;

  if (VT == MVT::v16i16) {
    // Upper quadword copied in order.
    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
      return false;

    // Lower quadword shuffled.
    for (unsigned i = 8; i != 12; ++i)
      if (!isUndefOrInRange(Mask[i], 8, 12))
        return false;
  }

  return true;
}

/// isPALIGNRMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PALIGNR.
static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
                          const X86Subtarget *Subtarget) {
  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
      (VT.is256BitVector() && !Subtarget->hasInt256()))
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  // Do not handle 64-bit element shuffles with palignr.
  if (NumLaneElts == 2)
    return false;

  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
    unsigned i;
    for (i = 0; i != NumLaneElts; ++i) {
      if (Mask[i+l] >= 0)
        break;
    }

    // Lane is all undef, go to next lane.
    if (i == NumLaneElts)
      continue;

    int Start = Mask[i+l];

    // Make sure it's in this lane in one of the sources.
    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
      return false;

    // If not lane 0, then we must match lane 0.
    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
      return false;

    // Correct second source to be contiguous with first source.
    if (Start >= (int)NumElts)
      Start -= NumElts - NumLaneElts;

    // Make sure we're shifting in the right direction.
    if (Start <= (int)(i+l))
      return false;

    Start -= i;

    // Check the rest of the elements to see if they are consecutive.
    for (++i; i != NumLaneElts; ++i) {
      int Idx = Mask[i+l];

      // Make sure it's in this lane.
      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
        return false;

      // If not lane 0, then we must match lane 0.
      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
        return false;

      if (Idx >= (int)NumElts)
        Idx -= NumElts - NumLaneElts;

      if (!isUndefOrEqual(Idx, Start+i))
        return false;
    }
  }

  return true;
}

/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
/// the two vector operands have swapped position.
static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
                                     unsigned NumElems) {
  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = Mask[i];
    if (idx < 0)
      continue;
    else if (idx < (int)NumElems)
      Mask[i] = idx + NumElems;
    else
      Mask[i] = idx - NumElems;
  }
}

/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to 128/256-bit
/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
/// the reverse of what x86 shuffles want.
static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElems = NumElems/NumLanes;

  if (NumLaneElems != 2 && NumLaneElems != 4)
    return false;

  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
  bool symetricMaskRequired =
    (VT.getSizeInBits() >= 256) && (EltSize == 32);

  // VSHUFPSY divides the resulting vector into 4 chunks.
  // The sources are also split into 4 chunks, and each destination
  // chunk must come from a different source chunk.
  //
  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
  //
  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
  //
  // VSHUFPDY divides the resulting vector into 4 chunks.
  // The sources are also split into 4 chunks, and each destination
  // chunk must come from a different source chunk.
3746 // 3747 // SRC1 => X3 X2 X1 X0 3748 // SRC2 => Y3 Y2 Y1 Y0 3749 // 3750 // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 3751 // 3752 SmallVector<int, 4> MaskVal(NumLaneElems, -1); 3753 unsigned HalfLaneElems = NumLaneElems/2; 3754 for (unsigned l = 0; l != NumElems; l += NumLaneElems) { 3755 for (unsigned i = 0; i != NumLaneElems; ++i) { 3756 int Idx = Mask[i+l]; 3757 unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0); 3758 if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems)) 3759 return false; 3760 // For VSHUFPSY, the mask of the second half must be the same as the 3761 // first but with the appropriate offsets. This works in the same way as 3762 // VPERMILPS works with masks. 3763 if (!symetricMaskRequired || Idx < 0) 3764 continue; 3765 if (MaskVal[i] < 0) { 3766 MaskVal[i] = Idx - l; 3767 continue; 3768 } 3769 if ((signed)(Idx - l) != MaskVal[i]) 3770 return false; 3771 } 3772 } 3773 3774 return true; 3775} 3776 3777/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3778/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 3779static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) { 3780 if (!VT.is128BitVector()) 3781 return false; 3782 3783 unsigned NumElems = VT.getVectorNumElements(); 3784 3785 if (NumElems != 4) 3786 return false; 3787 3788 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3789 return isUndefOrEqual(Mask[0], 6) && 3790 isUndefOrEqual(Mask[1], 7) && 3791 isUndefOrEqual(Mask[2], 2) && 3792 isUndefOrEqual(Mask[3], 3); 3793} 3794 3795/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3796/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3797/// <2, 3, 2, 3> 3798static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) { 3799 if (!VT.is128BitVector()) 3800 return false; 3801 3802 unsigned NumElems = VT.getVectorNumElements(); 3803 3804 if (NumElems != 4) 3805 return false; 3806 3807 return isUndefOrEqual(Mask[0], 2) && 3808 isUndefOrEqual(Mask[1], 3) && 3809 isUndefOrEqual(Mask[2], 2) && 3810 isUndefOrEqual(Mask[3], 3); 3811} 3812 3813/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3814/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3815static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) { 3816 if (!VT.is128BitVector()) 3817 return false; 3818 3819 unsigned NumElems = VT.getVectorNumElements(); 3820 3821 if (NumElems != 2 && NumElems != 4) 3822 return false; 3823 3824 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3825 if (!isUndefOrEqual(Mask[i], i + NumElems)) 3826 return false; 3827 3828 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) 3829 if (!isUndefOrEqual(Mask[i], i)) 3830 return false; 3831 3832 return true; 3833} 3834 3835/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3836/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3837static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) { 3838 if (!VT.is128BitVector()) 3839 return false; 3840 3841 unsigned NumElems = VT.getVectorNumElements(); 3842 3843 if (NumElems != 2 && NumElems != 4) 3844 return false; 3845 3846 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3847 if (!isUndefOrEqual(Mask[i], i)) 3848 return false; 3849 3850 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3851 if (!isUndefOrEqual(Mask[i + e], i + NumElems)) 3852 return false; 3853 3854 return true; 3855} 3856 3857// 3858// Some special combinations that can be optimized. 
3859// 3860static 3861SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, 3862 SelectionDAG &DAG) { 3863 MVT VT = SVOp->getSimpleValueType(0); 3864 SDLoc dl(SVOp); 3865 3866 if (VT != MVT::v8i32 && VT != MVT::v8f32) 3867 return SDValue(); 3868 3869 ArrayRef<int> Mask = SVOp->getMask(); 3870 3871 // These are the special masks that may be optimized. 3872 static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; 3873 static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; 3874 bool MatchEvenMask = true; 3875 bool MatchOddMask = true; 3876 for (int i=0; i<8; ++i) { 3877 if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) 3878 MatchEvenMask = false; 3879 if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) 3880 MatchOddMask = false; 3881 } 3882 3883 if (!MatchEvenMask && !MatchOddMask) 3884 return SDValue(); 3885 3886 SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); 3887 3888 SDValue Op0 = SVOp->getOperand(0); 3889 SDValue Op1 = SVOp->getOperand(1); 3890 3891 if (MatchEvenMask) { 3892 // Shift the second operand right to 32 bits. 3893 static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; 3894 Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); 3895 } else { 3896 // Shift the first operand left to 32 bits. 3897 static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; 3898 Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); 3899 } 3900 static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; 3901 return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); 3902} 3903 3904/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3905/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3906static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT, 3907 bool HasInt256, bool V2IsSplat = false) { 3908 3909 assert(VT.getSizeInBits() >= 128 && 3910 "Unsupported vector type for unpckl"); 3911 3912 // AVX defines UNPCK* to operate independently on 128-bit lanes. 3913 unsigned NumLanes; 3914 unsigned NumOf256BitLanes; 3915 unsigned NumElts = VT.getVectorNumElements(); 3916 if (VT.is256BitVector()) { 3917 if (NumElts != 4 && NumElts != 8 && 3918 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 3919 return false; 3920 NumLanes = 2; 3921 NumOf256BitLanes = 1; 3922 } else if (VT.is512BitVector()) { 3923 assert(VT.getScalarType().getSizeInBits() >= 32 && 3924 "Unsupported vector type for unpckh"); 3925 NumLanes = 2; 3926 NumOf256BitLanes = 2; 3927 } else { 3928 NumLanes = 1; 3929 NumOf256BitLanes = 1; 3930 } 3931 3932 unsigned NumEltsInStride = NumElts/NumOf256BitLanes; 3933 unsigned NumLaneElts = NumEltsInStride/NumLanes; 3934 3935 for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { 3936 for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { 3937 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { 3938 int BitI = Mask[l256*NumEltsInStride+l+i]; 3939 int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; 3940 if (!isUndefOrEqual(BitI, j+l256*NumElts)) 3941 return false; 3942 if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) 3943 return false; 3944 if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) 3945 return false; 3946 } 3947 } 3948 } 3949 return true; 3950} 3951 3952/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3953/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
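/// For example (illustrative, not from the original comments): for v4i32 the
/// canonical unpckh mask is <2, 6, 3, 7>, interleaving the high halves of the
/// two sources.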
static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
                         bool HasInt256, bool V2IsSplat = false) {
  assert(VT.getSizeInBits() >= 128 &&
         "Unsupported vector type for unpckh");

  // AVX defines UNPCK* to operate independently on 128-bit lanes.
  unsigned NumLanes;
  unsigned NumOf256BitLanes;
  unsigned NumElts = VT.getVectorNumElements();
  if (VT.is256BitVector()) {
    if (NumElts != 4 && NumElts != 8 &&
        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
      return false;
    NumLanes = 2;
    NumOf256BitLanes = 1;
  } else if (VT.is512BitVector()) {
    assert(VT.getScalarType().getSizeInBits() >= 32 &&
           "Unsupported vector type for unpckh");
    NumLanes = 2;
    NumOf256BitLanes = 2;
  } else {
    NumLanes = 1;
    NumOf256BitLanes = 1;
  }

  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
  unsigned NumLaneElts = NumEltsInStride/NumLanes;

  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
      for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
        int BitI  = Mask[l256*NumEltsInStride+l+i];
        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
        if (!isUndefOrEqual(BitI, j+l256*NumElts))
          return false;
        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
          return false;
        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
          return false;
      }
    }
  }
  return true;
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
  unsigned NumElts = VT.getVectorNumElements();
  bool Is256BitVec = VT.is256BitVector();

  if (VT.is512BitVector())
    return false;
  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for unpckh");

  if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
    return false;

  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern.
  // FIXME: Need a better way to get rid of this, there's no latency difference
  // between UNPCKLPD and MOVDDUP; the latter should always be checked first
  // and the former later. We should also remove the "_undef" special mask.
  if (NumElts == 4 && Is256BitVec)
    return false;

  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
  // independently on 128-bit lanes.
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
      int BitI  = Mask[l+i];
      int BitI1 = Mask[l+i+1];

      if (!isUndefOrEqual(BitI, j))
        return false;
      if (!isUndefOrEqual(BitI1, j))
        return false;
    }
  }

  return true;
}

/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
  unsigned NumElts = VT.getVectorNumElements();

  if (VT.is512BitVector())
    return false;

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for unpckh");

  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
    return false;

  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
  // independently on 128-bit lanes.
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
      int BitI  = Mask[l+i];
      int BitI1 = Mask[l+i+1];
      if (!isUndefOrEqual(BitI, j))
        return false;
      if (!isUndefOrEqual(BitI1, j))
        return false;
    }
  }
  return true;
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;
  if (!VT.is128BitVector())
    return false;

  unsigned NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (unsigned i = 1; i != NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
/// as permutations between 128-bit chunks or halves. As an example, in this
/// shuffle below:
///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
/// the first half comes from the second half of V1 and the second half comes
/// from the second half of V2.
static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
  if (!HasFp256 || !VT.is256BitVector())
    return false;

  // The shuffle result is divided into half A and half B. In total the two
  // sources have 4 halves, namely: C, D, E, F. The final values of A and
  // B must come from C, D, E or F.
  unsigned HalfSize = VT.getVectorNumElements()/2;
  bool MatchA = false, MatchB = false;

  // Check if A comes from one of C, D, E, F.
  for (unsigned Half = 0; Half != 4; ++Half) {
    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
      MatchA = true;
      break;
    }
  }

  // Check if B comes from one of C, D, E, F.
  for (unsigned Half = 0; Half != 4; ++Half) {
    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
      MatchB = true;
      break;
    }
  }

  return MatchA && MatchB;
}

/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
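/// For example (illustrative, not from the original comments): for the v8i32
/// mask <4, 5, 6, 7, 12, 13, 14, 15> above, the first half selects half 1
/// (the upper half of V1) and the second half selects half 3 (the upper half
/// of V2), giving the immediate (1 | (3 << 4)) == 0x31.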
static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
  MVT VT = SVOp->getSimpleValueType(0);

  unsigned HalfSize = VT.getVectorNumElements()/2;

  unsigned FstHalf = 0, SndHalf = 0;
  for (unsigned i = 0; i < HalfSize; ++i) {
    if (SVOp->getMaskElt(i) > 0) {
      FstHalf = SVOp->getMaskElt(i)/HalfSize;
      break;
    }
  }
  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
    if (SVOp->getMaskElt(i) > 0) {
      SndHalf = SVOp->getMaskElt(i)/HalfSize;
      break;
    }
  }

  return (FstHalf | (SndHalf << 4));
}

// Symmetric in-lane mask. Each lane has 4 elements (for imm8).
static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
  if (EltSize < 32)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  Imm8 = 0;
  if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
    for (unsigned i = 0; i != NumElts; ++i) {
      if (Mask[i] < 0)
        continue;
      Imm8 |= Mask[i] << (i*2);
    }
    return true;
  }

  unsigned LaneSize = 4;
  SmallVector<int, 4> MaskVal(LaneSize, -1);

  for (unsigned l = 0; l != NumElts; l += LaneSize) {
    for (unsigned i = 0; i != LaneSize; ++i) {
      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
        return false;
      if (Mask[i+l] < 0)
        continue;
      if (MaskVal[i] < 0) {
        MaskVal[i] = Mask[i+l] - l;
        Imm8 |= MaskVal[i] << (i*2);
        continue;
      }
      if (Mask[i+l] != (signed)(MaskVal[i]+l))
        return false;
    }
  }
  return true;
}

/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
/// Note that VPERMIL mask matching is different depending on whether the
/// underlying type is 32 or 64 bits wide. In VPERMILPS the high half of the
/// mask should point to the same elements as the low half, but relative to
/// the higher half of the source. In VPERMILPD the two lanes could be
/// shuffled independently of each other, with the same restriction that
/// lanes can't be crossed. Also handles PSHUFDY.
static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
  if (VT.getSizeInBits() < 256 || EltSize < 32)
    return false;
  bool symetricMaskRequired = (EltSize == 32);
  unsigned NumElts = VT.getVectorNumElements();

  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned LaneSize = NumElts/NumLanes;
  // 2 or 4 elements in one lane

  SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
  for (unsigned l = 0; l != NumElts; l += LaneSize) {
    for (unsigned i = 0; i != LaneSize; ++i) {
      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
        return false;
      if (symetricMaskRequired) {
        if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
          ExpectedMaskVal[i] = Mask[i+l] - l;
          continue;
        }
        if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
          return false;
      }
    }
  }
  return true;
}

/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of
/// what x86 movss wants. X86 movs requires the lowest element to be the
/// lowest element of vector 2 and the other elements to come from vector 1
/// in order.
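/// For example (illustrative, not from the original comments): for v4i32,
/// <4, 1, 2, 3> is the MOVL form that x86 wants, while the commuted form
/// accepted here is <0, 5, 6, 7>.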
4233static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT, 4234 bool V2IsSplat = false, bool V2IsUndef = false) { 4235 if (!VT.is128BitVector()) 4236 return false; 4237 4238 unsigned NumOps = VT.getVectorNumElements(); 4239 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 4240 return false; 4241 4242 if (!isUndefOrEqual(Mask[0], 0)) 4243 return false; 4244 4245 for (unsigned i = 1; i != NumOps; ++i) 4246 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 4247 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 4248 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 4249 return false; 4250 4251 return true; 4252} 4253 4254/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 4255/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 4256/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 4257static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT, 4258 const X86Subtarget *Subtarget) { 4259 if (!Subtarget->hasSSE3()) 4260 return false; 4261 4262 unsigned NumElems = VT.getVectorNumElements(); 4263 4264 if ((VT.is128BitVector() && NumElems != 4) || 4265 (VT.is256BitVector() && NumElems != 8) || 4266 (VT.is512BitVector() && NumElems != 16)) 4267 return false; 4268 4269 // "i+1" is the value the indexed mask element must have 4270 for (unsigned i = 0; i != NumElems; i += 2) 4271 if (!isUndefOrEqual(Mask[i], i+1) || 4272 !isUndefOrEqual(Mask[i+1], i+1)) 4273 return false; 4274 4275 return true; 4276} 4277 4278/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 4279/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 4280/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 4281static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT, 4282 const X86Subtarget *Subtarget) { 4283 if (!Subtarget->hasSSE3()) 4284 return false; 4285 4286 unsigned NumElems = VT.getVectorNumElements(); 4287 4288 if ((VT.is128BitVector() && NumElems != 4) || 4289 (VT.is256BitVector() && NumElems != 8) || 4290 (VT.is512BitVector() && NumElems != 16)) 4291 return false; 4292 4293 // "i" is the value the indexed mask element must have 4294 for (unsigned i = 0; i != NumElems; i += 2) 4295 if (!isUndefOrEqual(Mask[i], i) || 4296 !isUndefOrEqual(Mask[i+1], i)) 4297 return false; 4298 4299 return true; 4300} 4301 4302/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand 4303/// specifies a shuffle of elements that is suitable for input to 256-bit 4304/// version of MOVDDUP. 4305static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { 4306 if (!HasFp256 || !VT.is256BitVector()) 4307 return false; 4308 4309 unsigned NumElts = VT.getVectorNumElements(); 4310 if (NumElts != 4) 4311 return false; 4312 4313 for (unsigned i = 0; i != NumElts/2; ++i) 4314 if (!isUndefOrEqual(Mask[i], 0)) 4315 return false; 4316 for (unsigned i = NumElts/2; i != NumElts; ++i) 4317 if (!isUndefOrEqual(Mask[i], NumElts/2)) 4318 return false; 4319 return true; 4320} 4321 4322/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 4323/// specifies a shuffle of elements that is suitable for input to 128-bit 4324/// version of MOVDDUP. 
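/// For example (illustrative, not from the original comments): for v2f64 the
/// only matching mask is <0, 0> (or its undef variants), i.e. a broadcast of
/// the low element.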
static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
  if (!VT.is128BitVector())
    return false;

  unsigned e = VT.getVectorNumElements() / 2;
  for (unsigned i = 0; i != e; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;
  for (unsigned i = 0; i != e; ++i)
    if (!isUndefOrEqual(Mask[e+i], i))
      return false;
  return true;
}

/// isVEXTRACTIndex - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
/// suitable for instructions that extract 128- or 256-bit vectors.
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
    return false;

  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index =
    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();

  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
  bool Result = (Index * ElSize) % vecWidth == 0;

  return Result;
}

/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for input to
/// insertion of 128- or 256-bit subvectors.
static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
    return false;
  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index =
    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();

  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
  bool Result = (Index * ElSize) % vecWidth == 0;

  return Result;
}

bool X86::isVINSERT128Index(SDNode *N) {
  return isVINSERTIndex(N, 128);
}

bool X86::isVINSERT256Index(SDNode *N) {
  return isVINSERTIndex(N, 256);
}

bool X86::isVEXTRACT128Index(SDNode *N) {
  return isVEXTRACTIndex(N, 128);
}

bool X86::isVEXTRACT256Index(SDNode *N) {
  return isVEXTRACTIndex(N, 256);
}

/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
/// Handles 128-bit and 256-bit.
static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
  MVT VT = N->getSimpleValueType(0);

  assert((VT.getSizeInBits() >= 128) &&
         "Unsupported vector type for PSHUF/SHUFP");

  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
  // independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
         "Only supports 2, 4 or 8 elements per lane");

  unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
  unsigned Mask = 0;
  for (unsigned i = 0; i != NumElts; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt < 0) continue;
    Elt &= NumLaneElts - 1;
    unsigned ShAmt = (i << Shift) % 8;
    Mask |= Elt << ShAmt;
  }

  return Mask;
}

/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
  MVT VT = N->getSimpleValueType(0);

  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
         "Unsupported vector type for PSHUFHW");

  unsigned NumElts = VT.getVectorNumElements();

  unsigned Mask = 0;
  for (unsigned l = 0; l != NumElts; l += 8) {
    // 8 nodes per lane, but we only care about the last 4.
    for (unsigned i = 0; i < 4; ++i) {
      int Elt = N->getMaskElt(l+i+4);
      if (Elt < 0) continue;
      Elt &= 0x3; // only 2-bits.
      Mask |= Elt << (i * 2);
    }
  }

  return Mask;
}

/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
  MVT VT = N->getSimpleValueType(0);

  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
         "Unsupported vector type for PSHUFLW");

  unsigned NumElts = VT.getVectorNumElements();

  unsigned Mask = 0;
  for (unsigned l = 0; l != NumElts; l += 8) {
    // 8 nodes per lane, but we only care about the first 4.
    for (unsigned i = 0; i < 4; ++i) {
      int Elt = N->getMaskElt(l+i);
      if (Elt < 0) continue;
      Elt &= 0x3; // only 2-bits
      Mask |= Elt << (i * 2);
    }
  }

  return Mask;
}

/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
  MVT VT = SVOp->getSimpleValueType(0);
  unsigned EltSize = VT.is512BitVector() ? 1 :
    VT.getVectorElementType().getSizeInBits() >> 3;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
1 : VT.getSizeInBits()/128; 4480 unsigned NumLaneElts = NumElts/NumLanes; 4481 4482 int Val = 0; 4483 unsigned i; 4484 for (i = 0; i != NumElts; ++i) { 4485 Val = SVOp->getMaskElt(i); 4486 if (Val >= 0) 4487 break; 4488 } 4489 if (Val >= (int)NumElts) 4490 Val -= NumElts - NumLaneElts; 4491 4492 assert(Val - i > 0 && "PALIGNR imm should be positive"); 4493 return (Val - i) * EltSize; 4494} 4495 4496static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { 4497 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); 4498 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4499 llvm_unreachable("Illegal extract subvector for VEXTRACT"); 4500 4501 uint64_t Index = 4502 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4503 4504 MVT VecVT = N->getOperand(0).getSimpleValueType(); 4505 MVT ElVT = VecVT.getVectorElementType(); 4506 4507 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); 4508 return Index / NumElemsPerChunk; 4509} 4510 4511static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { 4512 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); 4513 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4514 llvm_unreachable("Illegal insert subvector for VINSERT"); 4515 4516 uint64_t Index = 4517 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4518 4519 MVT VecVT = N->getSimpleValueType(0); 4520 MVT ElVT = VecVT.getVectorElementType(); 4521 4522 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); 4523 return Index / NumElemsPerChunk; 4524} 4525 4526/// getExtractVEXTRACT128Immediate - Return the appropriate immediate 4527/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 4528/// and VEXTRACTI128 instructions. 4529unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { 4530 return getExtractVEXTRACTImmediate(N, 128); 4531} 4532 4533/// getExtractVEXTRACT256Immediate - Return the appropriate immediate 4534/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 4535/// and VEXTRACTI64x4 instructions. 4536unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { 4537 return getExtractVEXTRACTImmediate(N, 256); 4538} 4539 4540/// getInsertVINSERT128Immediate - Return the appropriate immediate 4541/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 4542/// and VINSERTI128 instructions. 4543unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { 4544 return getInsertVINSERTImmediate(N, 128); 4545} 4546 4547/// getInsertVINSERT256Immediate - Return the appropriate immediate 4548/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4 4549/// and VINSERTI64x4 instructions. 4550unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { 4551 return getInsertVINSERTImmediate(N, 256); 4552} 4553 4554/// isZeroNode - Returns true if Elt is a constant zero or a floating point 4555/// constant +0.0. 4556bool X86::isZeroNode(SDValue Elt) { 4557 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt)) 4558 return CN->isNullValue(); 4559 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt)) 4560 return CFP->getValueAPF().isPosZero(); 4561 return false; 4562} 4563 4564/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 4565/// their permute mask.
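/// For example (illustrative), commuting a v4f32 shuffle of (V1, V2) with
/// mask <4,1,6,3> yields a shuffle of (V2, V1) with mask <0,5,2,7>: each index
/// is moved across the NumElems boundary, so it still names the same source
/// element after the operands are swapped.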
4566static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 4567 SelectionDAG &DAG) { 4568 MVT VT = SVOp->getSimpleValueType(0); 4569 unsigned NumElems = VT.getVectorNumElements(); 4570 SmallVector<int, 8> MaskVec; 4571 4572 for (unsigned i = 0; i != NumElems; ++i) { 4573 int Idx = SVOp->getMaskElt(i); 4574 if (Idx >= 0) { 4575 if (Idx < (int)NumElems) 4576 Idx += NumElems; 4577 else 4578 Idx -= NumElems; 4579 } 4580 MaskVec.push_back(Idx); 4581 } 4582 return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1), 4583 SVOp->getOperand(0), &MaskVec[0]); 4584} 4585 4586/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 4587/// match movhlps. The lower half elements should come from the upper half of 4588/// V1 (and in order), and the upper half elements should come from the upper 4589/// half of V2 (and in order). 4590static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) { 4591 if (!VT.is128BitVector()) 4592 return false; 4593 if (VT.getVectorNumElements() != 4) 4594 return false; 4595 for (unsigned i = 0, e = 2; i != e; ++i) 4596 if (!isUndefOrEqual(Mask[i], i+2)) 4597 return false; 4598 for (unsigned i = 2; i != 4; ++i) 4599 if (!isUndefOrEqual(Mask[i], i+4)) 4600 return false; 4601 return true; 4602} 4603 4604/// isScalarLoadToVector - Returns true if the node is a scalar load that 4605/// is promoted to a vector. It also returns the LoadSDNode by reference if 4606/// required. 4607static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 4608 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 4609 return false; 4610 N = N->getOperand(0).getNode(); 4611 if (!ISD::isNON_EXTLoad(N)) 4612 return false; 4613 if (LD) 4614 *LD = cast<LoadSDNode>(N); 4615 return true; 4616} 4617 4618// Test whether the given value is a vector value which will be legalized 4619// into a load. 4620static bool WillBeConstantPoolLoad(SDNode *N) { 4621 if (N->getOpcode() != ISD::BUILD_VECTOR) 4622 return false; 4623 4624 // Check for any non-constant elements. 4625 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) 4626 switch (N->getOperand(i).getNode()->getOpcode()) { 4627 case ISD::UNDEF: 4628 case ISD::ConstantFP: 4629 case ISD::Constant: 4630 break; 4631 default: 4632 return false; 4633 } 4634 4635 // Vectors of all-zeros and all-ones are materialized with special 4636 // instructions rather than being loaded. 4637 return !ISD::isBuildVectorAllZeros(N) && 4638 !ISD::isBuildVectorAllOnes(N); 4639} 4640 4641/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 4642/// match movlp{s|d}. The lower half elements should come from the lower half 4643/// of V1 (and in order), and the upper half elements should come from the 4644/// upper half of V2 (and in order). And since V1 will become the source of 4645/// the MOVLP, it must be either a vector load or a scalar load to vector. 4646static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 4647 ArrayRef<int> Mask, MVT VT) { 4648 if (!VT.is128BitVector()) 4649 return false; 4650 4651 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 4652 return false; 4653 // If V2 is a vector load, don't do this transformation. We will try to use 4654 // a load-folding shufps op instead.
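// (Illustrative: shufps can fold its second source straight from memory,
// e.g. "shufps $imm, (mem), %xmm0" in AT&T syntax, so keeping the load on
// that side is usually cheaper than emitting a separate move.)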
4655 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) 4656 return false; 4657 4658 unsigned NumElems = VT.getVectorNumElements(); 4659 4660 if (NumElems != 2 && NumElems != 4) 4661 return false; 4662 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 4663 if (!isUndefOrEqual(Mask[i], i)) 4664 return false; 4665 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) 4666 if (!isUndefOrEqual(Mask[i], i+NumElems)) 4667 return false; 4668 return true; 4669} 4670 4671/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 4672/// all the same. 4673static bool isSplatVector(SDNode *N) { 4674 if (N->getOpcode() != ISD::BUILD_VECTOR) 4675 return false; 4676 4677 SDValue SplatValue = N->getOperand(0); 4678 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 4679 if (N->getOperand(i) != SplatValue) 4680 return false; 4681 return true; 4682} 4683 4684/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 4685/// to a zero vector. 4686/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 4687static bool isZeroShuffle(ShuffleVectorSDNode *N) { 4688 SDValue V1 = N->getOperand(0); 4689 SDValue V2 = N->getOperand(1); 4690 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 4691 for (unsigned i = 0; i != NumElems; ++i) { 4692 int Idx = N->getMaskElt(i); 4693 if (Idx >= (int)NumElems) { 4694 unsigned Opc = V2.getOpcode(); 4695 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 4696 continue; 4697 if (Opc != ISD::BUILD_VECTOR || 4698 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 4699 return false; 4700 } else if (Idx >= 0) { 4701 unsigned Opc = V1.getOpcode(); 4702 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 4703 continue; 4704 if (Opc != ISD::BUILD_VECTOR || 4705 !X86::isZeroNode(V1.getOperand(Idx))) 4706 return false; 4707 } 4708 } 4709 return true; 4710} 4711 4712/// getZeroVector - Returns a vector of specified type with all zero elements. 4713/// 4714static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, 4715 SelectionDAG &DAG, SDLoc dl) { 4716 assert(VT.isVector() && "Expected a vector type"); 4717 4718 // Always build SSE zero vectors as <4 x i32> bitcasted 4719 // to their dest type. This ensures they get CSE'd. 4720 SDValue Vec; 4721 if (VT.is128BitVector()) { // SSE 4722 if (Subtarget->hasSSE2()) { // SSE2 4723 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4724 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4725 } else { // SSE1 4726 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4727 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 4728 } 4729 } else if (VT.is256BitVector()) { // AVX 4730 if (Subtarget->hasInt256()) { // AVX2 4731 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4732 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4733 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 4734 array_lengthof(Ops)); 4735 } else { 4736 // 256-bit logic and arithmetic instructions in AVX are all 4737 // floating-point, no support for integer ops. Emit fp zeroed vectors.
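// (Illustrative: the v8f32 all-zeros build_vector constructed below is
// typically matched to a single vxorps ymm-zeroing idiom by instruction
// selection, so no constant-pool load is needed.)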
4738 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4739 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4740 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 4741 array_lengthof(Ops)); 4742 } 4743 } else if (VT.is512BitVector()) { // AVX-512 4744 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4745 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, 4746 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4747 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16); 4748 } else 4749 llvm_unreachable("Unexpected vector type"); 4750 4751 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4752} 4753 4754/// getOnesVector - Returns a vector of specified type with all bits set. 4755/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with 4756/// no AVX2 support, use two <4 x i32> vectors inserted into an <8 x i32>. 4757/// Then bitcast to their original type, ensuring they get CSE'd. 4758static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, 4759 SDLoc dl) { 4760 assert(VT.isVector() && "Expected a vector type"); 4761 4762 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 4763 SDValue Vec; 4764 if (VT.is256BitVector()) { 4765 if (HasInt256) { // AVX2 4766 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4767 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 4768 array_lengthof(Ops)); 4769 } else { // AVX 4770 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4771 Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); 4772 } 4773 } else if (VT.is128BitVector()) { 4774 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4775 } else 4776 llvm_unreachable("Unexpected vector type"); 4777 4778 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4779} 4780 4781/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 4782/// that point to V2 point to its first element. 4783static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) { 4784 for (unsigned i = 0; i != NumElems; ++i) { 4785 if (Mask[i] > (int)NumElems) { 4786 Mask[i] = NumElems; 4787 } 4788 } 4789} 4790 4791/// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd 4792/// operation of the specified width, e.g. mask <4,1,2,3> for v4f32. 4793static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, 4794 SDValue V2) { 4795 unsigned NumElems = VT.getVectorNumElements(); 4796 SmallVector<int, 8> Mask; 4797 Mask.push_back(NumElems); 4798 for (unsigned i = 1; i != NumElems; ++i) 4799 Mask.push_back(i); 4800 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4801} 4802 4803/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 4804static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, 4805 SDValue V2) { 4806 unsigned NumElems = VT.getVectorNumElements(); 4807 SmallVector<int, 8> Mask; 4808 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 4809 Mask.push_back(i); 4810 Mask.push_back(i + NumElems); 4811 } 4812 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4813} 4814 4815/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
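/// For v4f32 (illustrative) the unpackh mask is <2,6,3,7>, i.e. the element
/// interleaving performed by unpckhps.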
4816static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, 4817 SDValue V2) { 4818 unsigned NumElems = VT.getVectorNumElements(); 4819 SmallVector<int, 8> Mask; 4820 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { 4821 Mask.push_back(i + Half); 4822 Mask.push_back(i + NumElems + Half); 4823 } 4824 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4825} 4826 4827// PromoteSplati8i16 - i16 and i8 vector types can't be used directly by 4828// a generic shuffle instruction because the target has no such instructions. 4829// Generate shuffles which repeat i16 and i8 several times until they can be 4830// represented by v4f32 and then be manipulated by target supported shuffles. 4831static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { 4832 MVT VT = V.getSimpleValueType(); 4833 int NumElems = VT.getVectorNumElements(); 4834 SDLoc dl(V); 4835 4836 while (NumElems > 4) { 4837 if (EltNo < NumElems/2) { 4838 V = getUnpackl(DAG, dl, VT, V, V); 4839 } else { 4840 V = getUnpackh(DAG, dl, VT, V, V); 4841 EltNo -= NumElems/2; 4842 } 4843 NumElems >>= 1; 4844 } 4845 return V; 4846} 4847 4848/// getLegalSplat - Generate a legal splat with supported x86 shuffles 4849static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { 4850 MVT VT = V.getSimpleValueType(); 4851 SDLoc dl(V); 4852 4853 if (VT.is128BitVector()) { 4854 V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); 4855 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 4856 V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), 4857 &SplatMask[0]); 4858 } else if (VT.is256BitVector()) { 4859 // To use VPERMILPS to splat scalars, the second half of indices must 4860 // refer to the higher part, which is a duplication of the lower one, 4861 // because VPERMILPS can only handle in-lane permutations. 4862 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, 4863 EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; 4864 4865 V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); 4866 V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), 4867 &SplatMask[0]); 4868 } else 4869 llvm_unreachable("Vector size not supported"); 4870 4871 return DAG.getNode(ISD::BITCAST, dl, VT, V); 4872} 4873 4874/// PromoteSplat - Splat is promoted to target supported vector shuffles. 4875static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 4876 MVT SrcVT = SV->getSimpleValueType(0); 4877 SDValue V1 = SV->getOperand(0); 4878 SDLoc dl(SV); 4879 4880 int EltNo = SV->getSplatIndex(); 4881 int NumElems = SrcVT.getVectorNumElements(); 4882 bool Is256BitVec = SrcVT.is256BitVector(); 4883 4884 assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) && 4885 "Unknown how to promote splat for type"); 4886 4887 // Extract the 128-bit part containing the splat element and update 4888 // the splat element index when it refers to the higher register. 4889 if (Is256BitVec) { 4890 V1 = Extract128BitVector(V1, EltNo, DAG, dl); 4891 if (EltNo >= NumElems/2) 4892 EltNo -= NumElems/2; 4893 } 4894 4895 // i16 and i8 vector types can't be used directly by a generic shuffle 4896 // instruction because the target has no such instruction. Generate shuffles 4897 // which repeat i16 and i8 several times until they fit in i32, and then can 4898 // be manipulated by target supported shuffles.
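// (Illustrative: to splat element 5 of a v8i16, one unpckh round turns
// <0..7> into <4,4,5,5,6,6,7,7>; the splat value now fills i16 lanes 2-3,
// which is f32 lane 1, matching the adjusted EltNo of 5 - 4 = 1.)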
4899 MVT EltVT = SrcVT.getVectorElementType(); 4900 if (EltVT == MVT::i8 || EltVT == MVT::i16) 4901 V1 = PromoteSplati8i16(V1, DAG, EltNo); 4902 4903 // Recreate the 256-bit vector and place the same 128-bit vector 4904 // into the low and high part. This is necessary because we want 4905 // to use VPERM* to shuffle the vectors. 4906 if (Is256BitVec) { 4907 V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); 4908 } 4909 4910 return getLegalSplat(DAG, V1, EltNo); 4911} 4912 4913/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4914/// vector and a zero or undef vector. This produces a shuffle where the low 4915/// element of V2 is swizzled into the zero/undef vector, landing at element 4916/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 4917static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4918 bool IsZero, 4919 const X86Subtarget *Subtarget, 4920 SelectionDAG &DAG) { 4921 MVT VT = V2.getSimpleValueType(); 4922 SDValue V1 = IsZero 4923 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); 4924 unsigned NumElems = VT.getVectorNumElements(); 4925 SmallVector<int, 16> MaskVec; 4926 for (unsigned i = 0; i != NumElems; ++i) 4927 // If this is the insertion idx, put the low elt of V2 here. 4928 MaskVec.push_back(i == Idx ? NumElems : i); 4929 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); 4930} 4931 4932/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the 4933/// target specific opcode. Returns true if the Mask could be calculated. 4934/// Sets IsUnary to true if the shuffle uses only one source. 4935static bool getTargetShuffleMask(SDNode *N, MVT VT, 4936 SmallVectorImpl<int> &Mask, bool &IsUnary) { 4937 unsigned NumElems = VT.getVectorNumElements(); 4938 SDValue ImmN; 4939 4940 IsUnary = false; 4941 switch(N->getOpcode()) { 4942 case X86ISD::SHUFP: 4943 ImmN = N->getOperand(N->getNumOperands()-1); 4944 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4945 break; 4946 case X86ISD::UNPCKH: 4947 DecodeUNPCKHMask(VT, Mask); 4948 break; 4949 case X86ISD::UNPCKL: 4950 DecodeUNPCKLMask(VT, Mask); 4951 break; 4952 case X86ISD::MOVHLPS: 4953 DecodeMOVHLPSMask(NumElems, Mask); 4954 break; 4955 case X86ISD::MOVLHPS: 4956 DecodeMOVLHPSMask(NumElems, Mask); 4957 break; 4958 case X86ISD::PALIGNR: 4959 ImmN = N->getOperand(N->getNumOperands()-1); 4960 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4961 break; 4962 case X86ISD::PSHUFD: 4963 case X86ISD::VPERMILP: 4964 ImmN = N->getOperand(N->getNumOperands()-1); 4965 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4966 IsUnary = true; 4967 break; 4968 case X86ISD::PSHUFHW: 4969 ImmN = N->getOperand(N->getNumOperands()-1); 4970 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4971 IsUnary = true; 4972 break; 4973 case X86ISD::PSHUFLW: 4974 ImmN = N->getOperand(N->getNumOperands()-1); 4975 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4976 IsUnary = true; 4977 break; 4978 case X86ISD::VPERMI: 4979 ImmN = N->getOperand(N->getNumOperands()-1); 4980 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4981 IsUnary = true; 4982 break; 4983 case X86ISD::MOVSS: 4984 case X86ISD::MOVSD: { 4985 // The index 0 always comes from the first element of the second source; 4986 // this is why MOVSS and MOVSD are used in the first place.
The other 4987 // elements come from the other positions of the first source vector. 4988 Mask.push_back(NumElems); 4989 for (unsigned i = 1; i != NumElems; ++i) { 4990 Mask.push_back(i); 4991 } 4992 break; 4993 } 4994 case X86ISD::VPERM2X128: 4995 ImmN = N->getOperand(N->getNumOperands()-1); 4996 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4997 if (Mask.empty()) return false; 4998 break; 4999 case X86ISD::MOVDDUP: 5000 case X86ISD::MOVLHPD: 5001 case X86ISD::MOVLPD: 5002 case X86ISD::MOVLPS: 5003 case X86ISD::MOVSHDUP: 5004 case X86ISD::MOVSLDUP: 5005 // Not yet implemented 5006 return false; 5007 default: llvm_unreachable("unknown target shuffle node"); 5008 } 5009 5010 return true; 5011} 5012 5013/// getShuffleScalarElt - Returns the scalar element that will make up the ith 5014/// element of the result of the vector shuffle. 5015static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, 5016 unsigned Depth) { 5017 if (Depth == 6) 5018 return SDValue(); // Limit search depth. 5019 5020 SDValue V = SDValue(N, 0); 5021 EVT VT = V.getValueType(); 5022 unsigned Opcode = V.getOpcode(); 5023 5024 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 5025 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 5026 int Elt = SV->getMaskElt(Index); 5027 5028 if (Elt < 0) 5029 return DAG.getUNDEF(VT.getVectorElementType()); 5030 5031 unsigned NumElems = VT.getVectorNumElements(); 5032 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) 5033 : SV->getOperand(1); 5034 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); 5035 } 5036 5037 // Recurse into target specific vector shuffles to find scalars. 5038 if (isTargetShuffle(Opcode)) { 5039 MVT ShufVT = V.getSimpleValueType(); 5040 unsigned NumElems = ShufVT.getVectorNumElements(); 5041 SmallVector<int, 16> ShuffleMask; 5042 bool IsUnary; 5043 5044 if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) 5045 return SDValue(); 5046 5047 int Elt = ShuffleMask[Index]; 5048 if (Elt < 0) 5049 return DAG.getUNDEF(ShufVT.getVectorElementType()); 5050 5051 SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) 5052 : N->getOperand(1); 5053 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, 5054 Depth+1); 5055 } 5056 5057 // Actual nodes that may contain scalar elements 5058 if (Opcode == ISD::BITCAST) { 5059 V = V.getOperand(0); 5060 EVT SrcVT = V.getValueType(); 5061 unsigned NumElems = VT.getVectorNumElements(); 5062 5063 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 5064 return SDValue(); 5065 } 5066 5067 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5068 return (Index == 0) ? V.getOperand(0) 5069 : DAG.getUNDEF(VT.getVectorElementType()); 5070 5071 if (V.getOpcode() == ISD::BUILD_VECTOR) 5072 return V.getOperand(Index); 5073 5074 return SDValue(); 5075} 5076 5077/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a 5078/// vector shuffle result that are zero. The 5079/// search can start in two different directions, from left or right. 5080/// We count undefs as zeros until PreferredNum is reached. 5081static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, 5082 unsigned NumElems, bool ZerosFromLeft, 5083 SelectionDAG &DAG, 5084 unsigned PreferredNum = -1U) { 5085 unsigned NumZeros = 0; 5086 for (unsigned i = 0; i != NumElems; ++i) { 5087 unsigned Index = ZerosFromLeft ?
i : NumElems - i - 1; 5088 SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); 5089 if (!Elt.getNode()) 5090 break; 5091 5092 if (X86::isZeroNode(Elt)) 5093 ++NumZeros; 5094 else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum. 5095 NumZeros = std::min(NumZeros + 1, PreferredNum); 5096 else 5097 break; 5098 } 5099 5100 return NumZeros; 5101} 5102 5103/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE) 5104/// correspond consecutively to elements from one of the vector operands, 5105/// starting from its index OpIdx. Also sets OpNum to the matched source vector operand. 5106static 5107bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, 5108 unsigned MaskI, unsigned MaskE, unsigned OpIdx, 5109 unsigned NumElems, unsigned &OpNum) { 5110 bool SeenV1 = false; 5111 bool SeenV2 = false; 5112 5113 for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) { 5114 int Idx = SVOp->getMaskElt(i); 5115 // Ignore undef indices 5116 if (Idx < 0) 5117 continue; 5118 5119 if (Idx < (int)NumElems) 5120 SeenV1 = true; 5121 else 5122 SeenV2 = true; 5123 5124 // Only accept consecutive elements from the same vector 5125 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) 5126 return false; 5127 } 5128 5129 OpNum = SeenV1 ? 0 : 1; 5130 return true; 5131} 5132 5133/// isVectorShiftRight - Returns true if the shuffle can be implemented as a 5134/// logical right shift of a vector. 5135static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 5136 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 5137 unsigned NumElems = 5138 SVOp->getSimpleValueType(0).getVectorNumElements(); 5139 unsigned NumZeros = getNumOfConsecutiveZeros( 5140 SVOp, NumElems, false /* check zeros from right */, DAG, 5141 SVOp->getMaskElt(0)); 5142 unsigned OpSrc; 5143 5144 if (!NumZeros) 5145 return false; 5146 5147 // Considering the elements in the mask that are not consecutive zeros, 5148 // check if they consecutively come from only one of the source vectors. 5149 // 5150 // V1 = {X, A, B, C} 0 5151 // \ \ \ / 5152 // vector_shuffle V1, V2 <1, 2, 3, X> 5153 // 5154 if (!isShuffleMaskConsecutive(SVOp, 5155 0, // Mask Start Index 5156 NumElems-NumZeros, // Mask End Index (exclusive) 5157 NumZeros, // Where to start looking in the src vector 5158 NumElems, // Number of elements in vector 5159 OpSrc)) // Which source operand ? 5160 return false; 5161 5162 isLeft = false; 5163 ShAmt = NumZeros; 5164 ShVal = SVOp->getOperand(OpSrc); 5165 return true; 5166} 5167 5168/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 5169/// logical left shift of a vector. 5170static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 5171 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 5172 unsigned NumElems = 5173 SVOp->getSimpleValueType(0).getVectorNumElements(); 5174 unsigned NumZeros = getNumOfConsecutiveZeros( 5175 SVOp, NumElems, true /* check zeros from left */, DAG, 5176 NumElems - SVOp->getMaskElt(NumElems - 1) - 1); 5177 unsigned OpSrc; 5178 5179 if (!NumZeros) 5180 return false; 5181 5182 // Considering the elements in the mask that are not consecutive zeros, 5183 // check if they consecutively come from only one of the source vectors.
5184 // 5185 // 0 { A, B, X, X } = V2 5186 // / \ / / 5187 // vector_shuffle V1, V2 <X, X, 4, 5> 5188 // 5189 if (!isShuffleMaskConsecutive(SVOp, 5190 NumZeros, // Mask Start Index 5191 NumElems, // Mask End Index (exclusive) 5192 0, // Where to start looking in the src vector 5193 NumElems, // Number of elements in vector 5194 OpSrc)) // Which source operand ? 5195 return false; 5196 5197 isLeft = true; 5198 ShAmt = NumZeros; 5199 ShVal = SVOp->getOperand(OpSrc); 5200 return true; 5201} 5202 5203/// isVectorShift - Returns true if the shuffle can be implemented as a 5204/// logical left or right shift of a vector. 5205static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 5206 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 5207 // Although the logic below supports any bit width, there are no 5208 // shift instructions which handle more than 128-bit vectors. 5209 if (!SVOp->getSimpleValueType(0).is128BitVector()) 5210 return false; 5211 5212 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 5213 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 5214 return true; 5215 5216 return false; 5217} 5218 5219/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 5220/// 5221static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 5222 unsigned NumNonZero, unsigned NumZero, 5223 SelectionDAG &DAG, 5224 const X86Subtarget* Subtarget, 5225 const TargetLowering &TLI) { 5226 if (NumNonZero > 8) 5227 return SDValue(); 5228 5229 SDLoc dl(Op); 5230 SDValue V(0, 0); 5231 bool First = true; 5232 for (unsigned i = 0; i < 16; ++i) { 5233 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 5234 if (ThisIsNonZero && First) { 5235 if (NumZero) 5236 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 5237 else 5238 V = DAG.getUNDEF(MVT::v8i16); 5239 First = false; 5240 } 5241 5242 if ((i & 1) != 0) { 5243 SDValue ThisElt(0, 0), LastElt(0, 0); 5244 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 5245 if (LastIsNonZero) { 5246 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 5247 MVT::i16, Op.getOperand(i-1)); 5248 } 5249 if (ThisIsNonZero) { 5250 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 5251 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 5252 ThisElt, DAG.getConstant(8, MVT::i8)); 5253 if (LastIsNonZero) 5254 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 5255 } else 5256 ThisElt = LastElt; 5257 5258 if (ThisElt.getNode()) 5259 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 5260 DAG.getIntPtrConstant(i/2)); 5261 } 5262 } 5263 5264 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 5265} 5266 5267/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 5268/// 5269static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 5270 unsigned NumNonZero, unsigned NumZero, 5271 SelectionDAG &DAG, 5272 const X86Subtarget* Subtarget, 5273 const TargetLowering &TLI) { 5274 if (NumNonZero > 4) 5275 return SDValue(); 5276 5277 SDLoc dl(Op); 5278 SDValue V(0, 0); 5279 bool First = true; 5280 for (unsigned i = 0; i < 8; ++i) { 5281 bool isNonZero = (NonZeros & (1 << i)) != 0; 5282 if (isNonZero) { 5283 if (First) { 5284 if (NumZero) 5285 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 5286 else 5287 V = DAG.getUNDEF(MVT::v8i16); 5288 First = false; 5289 } 5290 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 5291 MVT::v8i16, V, Op.getOperand(i), 5292 DAG.getIntPtrConstant(i)); 5293 } 5294 } 5295 5296 return V; 5297} 5298 5299/// getVShift - Return a vector logical shift node.
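/// The NumBits shift amount is expressed in bits; e.g. (illustrative) a left
/// shift of a 128-bit vector by 64 bits corresponds to the byte-wise
/// pslldq $8.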
5300/// 5301static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 5302 unsigned NumBits, SelectionDAG &DAG, 5303 const TargetLowering &TLI, SDLoc dl) { 5304 assert(VT.is128BitVector() && "Unknown type for VShift"); 5305 EVT ShVT = MVT::v2i64; 5306 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; 5307 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 5308 return DAG.getNode(ISD::BITCAST, dl, VT, 5309 DAG.getNode(Opc, dl, ShVT, SrcOp, 5310 DAG.getConstant(NumBits, 5311 TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); 5312} 5313 5314static SDValue 5315LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { 5316 5317 // Check if the scalar load can be widened into a vector load. And if 5318 // the address is "base + cst" see if the cst can be "absorbed" into 5319 // the shuffle mask. 5320 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 5321 SDValue Ptr = LD->getBasePtr(); 5322 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 5323 return SDValue(); 5324 EVT PVT = LD->getValueType(0); 5325 if (PVT != MVT::i32 && PVT != MVT::f32) 5326 return SDValue(); 5327 5328 int FI = -1; 5329 int64_t Offset = 0; 5330 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 5331 FI = FINode->getIndex(); 5332 Offset = 0; 5333 } else if (DAG.isBaseWithConstantOffset(Ptr) && 5334 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 5335 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 5336 Offset = Ptr.getConstantOperandVal(1); 5337 Ptr = Ptr.getOperand(0); 5338 } else { 5339 return SDValue(); 5340 } 5341 5342 // FIXME: 256-bit vector instructions don't require a strict alignment, 5343 // improve this code to support it better. 5344 unsigned RequiredAlign = VT.getSizeInBits()/8; 5345 SDValue Chain = LD->getChain(); 5346 // Make sure the stack object alignment is at least 16 or 32. 5347 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5348 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 5349 if (MFI->isFixedObjectIndex(FI)) { 5350 // Can't change the alignment. FIXME: It's possible to compute 5351 // the exact stack offset and reference FI + adjusted offset instead, 5352 // if someone *really* cares about this. That's the way to implement it. 5353 return SDValue(); 5354 } else { 5355 MFI->setObjectAlignment(FI, RequiredAlign); 5356 } 5357 } 5358 5359 // (Offset % 16 or 32) must be a multiple of 4. The address is then 5360 // Ptr + (Offset & ~15). 5361 if (Offset < 0) 5362 return SDValue(); 5363 if ((Offset % RequiredAlign) & 3) 5364 return SDValue(); 5365 int64_t StartOffset = Offset & ~(RequiredAlign-1); 5366 if (StartOffset) 5367 Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(), 5368 Ptr, DAG.getConstant(StartOffset, Ptr.getValueType())); 5369 5370 int EltNo = (Offset - StartOffset) >> 2; 5371 unsigned NumElems = VT.getVectorNumElements(); 5372 5373 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 5374 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 5375 LD->getPointerInfo().getWithOffset(StartOffset), 5376 false, false, false, 0); 5377 5378 SmallVector<int, 8> Mask; 5379 for (unsigned i = 0; i != NumElems; ++i) 5380 Mask.push_back(EltNo); 5381 5382 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); 5383 } 5384 5385 return SDValue(); 5386} 5387 5388/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 5389/// vector of type 'VT', see if the elements can be replaced by a single large 5390/// load which has the same value as a build_vector whose operands are 'Elts'.
5391/// 5392/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 5393/// 5394/// FIXME: we'd also like to handle the case where the last elements are zero 5395/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 5396/// There's even a handy isZeroNode for that purpose. 5397static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 5398 SDLoc &DL, SelectionDAG &DAG) { 5399 EVT EltVT = VT.getVectorElementType(); 5400 unsigned NumElems = Elts.size(); 5401 5402 LoadSDNode *LDBase = NULL; 5403 unsigned LastLoadedElt = -1U; 5404 5405 // For each element in the initializer, see if we've found a load or an undef. 5406 // If we don't find an initial load element, or later load elements are 5407 // non-consecutive, bail out. 5408 for (unsigned i = 0; i < NumElems; ++i) { 5409 SDValue Elt = Elts[i]; 5410 5411 if (!Elt.getNode() || 5412 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 5413 return SDValue(); 5414 if (!LDBase) { 5415 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 5416 return SDValue(); 5417 LDBase = cast<LoadSDNode>(Elt.getNode()); 5418 LastLoadedElt = i; 5419 continue; 5420 } 5421 if (Elt.getOpcode() == ISD::UNDEF) 5422 continue; 5423 5424 LoadSDNode *LD = cast<LoadSDNode>(Elt); 5425 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 5426 return SDValue(); 5427 LastLoadedElt = i; 5428 } 5429 5430 // If we have found an entire vector of loads and undefs, then return a large 5431 // load of the entire vector width starting at the base pointer. If we found 5432 // consecutive loads for the low half, generate a vzext_load node. 5433 if (LastLoadedElt == NumElems - 1) { 5434 SDValue NewLd = SDValue(); 5435 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 5436 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 5437 LDBase->getPointerInfo(), 5438 LDBase->isVolatile(), LDBase->isNonTemporal(), 5439 LDBase->isInvariant(), 0); else // Otherwise keep the original load's alignment. 5440 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 5441 LDBase->getPointerInfo(), 5442 LDBase->isVolatile(), LDBase->isNonTemporal(), 5443 LDBase->isInvariant(), LDBase->getAlignment()); 5444 5445 if (LDBase->hasAnyUseOfValue(1)) { 5446 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 5447 SDValue(LDBase, 1), 5448 SDValue(NewLd.getNode(), 1)); 5449 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 5450 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 5451 SDValue(NewLd.getNode(), 1)); 5452 } 5453 5454 return NewLd; 5455 } 5456 if (NumElems == 4 && LastLoadedElt == 1 && 5457 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 5458 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 5459 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 5460 SDValue ResNode = 5461 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 5462 array_lengthof(Ops), MVT::i64, 5463 LDBase->getPointerInfo(), 5464 LDBase->getAlignment(), 5465 false/*isVolatile*/, true/*ReadMem*/, 5466 false/*WriteMem*/); 5467 5468 // Make sure the newly-created LOAD is in the same position as LDBase in 5469 // terms of dependency. We create a TokenFactor for LDBase and ResNode, and 5470 // update uses of LDBase's output chain to use the TokenFactor.
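// (For instance, if a store was chained on LDBase's output chain, that store
// must now also depend on the new VZEXT_LOAD node; the TokenFactor created
// below expresses this joint dependency.)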
5471 if (LDBase->hasAnyUseOfValue(1)) { 5472 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 5473 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); 5474 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 5475 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 5476 SDValue(ResNode.getNode(), 1)); 5477 } 5478 5479 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 5480 } 5481 return SDValue(); 5482} 5483 5484/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction 5485/// to generate a splat value for the following cases: 5486/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. 5487/// 2. A splat shuffle which uses a scalar_to_vector node which comes from 5488/// a scalar load, or a constant. 5489/// The VBROADCAST node is returned when a pattern is found, 5490/// or SDValue() otherwise. 5491static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, 5492 SelectionDAG &DAG) { 5493 if (!Subtarget->hasFp256()) 5494 return SDValue(); 5495 5496 MVT VT = Op.getSimpleValueType(); 5497 SDLoc dl(Op); 5498 5499 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && 5500 "Unsupported vector type for broadcast."); 5501 5502 SDValue Ld; 5503 bool ConstSplatVal; 5504 5505 switch (Op.getOpcode()) { 5506 default: 5507 // Unknown pattern found. 5508 return SDValue(); 5509 5510 case ISD::BUILD_VECTOR: { 5511 // The BUILD_VECTOR node must be a splat. 5512 if (!isSplatVector(Op.getNode())) 5513 return SDValue(); 5514 5515 Ld = Op.getOperand(0); 5516 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5517 Ld.getOpcode() == ISD::ConstantFP); 5518 5519 // The suspected load node has several users. Make sure that all 5520 // of its users are from the BUILD_VECTOR node. 5521 // Constants may have multiple users. 5522 if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) 5523 return SDValue(); 5524 break; 5525 } 5526 5527 case ISD::VECTOR_SHUFFLE: { 5528 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5529 5530 // Shuffles must have a splat mask where the first element is 5531 // broadcasted. 5532 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 5533 return SDValue(); 5534 5535 SDValue Sc = Op.getOperand(0); 5536 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && 5537 Sc.getOpcode() != ISD::BUILD_VECTOR) { 5538 5539 if (!Subtarget->hasInt256()) 5540 return SDValue(); 5541 5542 // Use the register form of the broadcast instruction available on AVX2. 5543 if (VT.getSizeInBits() >= 256) 5544 Sc = Extract128BitVector(Sc, 0, DAG, dl); 5545 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); 5546 } 5547 5548 Ld = Sc.getOperand(0); 5549 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5550 Ld.getOpcode() == ISD::ConstantFP); 5551 5552 // The scalar_to_vector node and the suspected 5553 // load node must have exactly one user. 5554 // Constants may have multiple users. 5555 5556 // AVX-512 has a register version of the broadcast 5557 bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && 5558 Ld.getValueType().getSizeInBits() >= 32; 5559 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && 5560 !hasRegVer)) 5561 return SDValue(); 5562 break; 5563 } 5564 } 5565 5566 bool IsGE256 = (VT.getSizeInBits() >= 256); 5567 5568 // Handle broadcasting a single constant scalar from the constant pool 5569 // into a vector. On Sandy Bridge it is still better to load a constant 5570 // vector from the constant pool and not to broadcast it from a scalar.
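// (Illustrative: with AVX2 a v8f32 splat of +1.0 can be emitted as a
// vbroadcastss from a single 4-byte constant-pool entry rather than a load
// of a full 32-byte vector constant.)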
5571 if (ConstSplatVal && Subtarget->hasInt256()) { 5572 EVT CVT = Ld.getValueType(); 5573 assert(!CVT.isVector() && "Must not broadcast a vector type"); 5574 unsigned ScalarSize = CVT.getSizeInBits(); 5575 5576 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { 5577 const Constant *C = 0; 5578 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) 5579 C = CI->getConstantIntValue(); 5580 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) 5581 C = CF->getConstantFPValue(); 5582 5583 assert(C && "Invalid constant type"); 5584 5585 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5586 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 5587 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 5588 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, 5589 MachinePointerInfo::getConstantPool(), 5590 false, false, false, Alignment); 5591 5592 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5593 } 5594 } 5595 5596 bool IsLoad = ISD::isNormalLoad(Ld.getNode()); 5597 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 5598 5599 // Handle AVX2 in-register broadcasts. 5600 if (!IsLoad && Subtarget->hasInt256() && 5601 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) 5602 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5603 5604 // The scalar source must be a normal load. 5605 if (!IsLoad) 5606 return SDValue(); 5607 5608 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) 5609 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5610 5611 // The integer check is needed for the 64-bit-into-128-bit case, so that it 5612 // doesn't match double, since there is no vbroadcastsd xmm. 5613 if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { 5614 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) 5615 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5616 } 5617 5618 // Unsupported broadcast. 5619 return SDValue(); 5620} 5621 5622static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { 5623 MVT VT = Op.getSimpleValueType(); 5624 5625 // Skip if insert_vec_elt is not supported. 5626 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5627 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) 5628 return SDValue(); 5629 5630 SDLoc DL(Op); 5631 unsigned NumElems = Op.getNumOperands(); 5632 5633 SDValue VecIn1; 5634 SDValue VecIn2; 5635 SmallVector<unsigned, 4> InsertIndices; 5636 SmallVector<int, 8> Mask(NumElems, -1); 5637 5638 for (unsigned i = 0; i != NumElems; ++i) { 5639 unsigned Opc = Op.getOperand(i).getOpcode(); 5640 5641 if (Opc == ISD::UNDEF) 5642 continue; 5643 5644 if (Opc != ISD::EXTRACT_VECTOR_ELT) { 5645 // Quit if more than one element needs inserting. 5646 if (InsertIndices.size() > 1) 5647 return SDValue(); 5648 5649 InsertIndices.push_back(i); 5650 continue; 5651 } 5652 5653 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); 5654 SDValue ExtIdx = Op.getOperand(i).getOperand(1); 5655 5656 // Quit if extracted from a vector of a different type. 5657 if (ExtractedFromVec.getValueType() != VT) 5658 return SDValue(); 5659 5660 // Quit if the index is not a constant.
5661 if (!isa<ConstantSDNode>(ExtIdx)) 5662 return SDValue(); 5663 5664 if (VecIn1.getNode() == 0) 5665 VecIn1 = ExtractedFromVec; 5666 else if (VecIn1 != ExtractedFromVec) { 5667 if (VecIn2.getNode() == 0) 5668 VecIn2 = ExtractedFromVec; 5669 else if (VecIn2 != ExtractedFromVec) 5670 // Quit if more than 2 vectors to shuffle 5671 return SDValue(); 5672 } 5673 5674 unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); 5675 5676 if (ExtractedFromVec == VecIn1) 5677 Mask[i] = Idx; 5678 else if (ExtractedFromVec == VecIn2) 5679 Mask[i] = Idx + NumElems; 5680 } 5681 5682 if (VecIn1.getNode() == 0) 5683 return SDValue(); 5684 5685 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); 5686 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); 5687 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { 5688 unsigned Idx = InsertIndices[i]; 5689 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), 5690 DAG.getIntPtrConstant(Idx)); 5691 } 5692 5693 return NV; 5694} 5695 5696// Lower BUILD_VECTOR operation for v8i1 and v16i1 types. 5697SDValue 5698X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { 5699 5700 MVT VT = Op.getSimpleValueType(); 5701 assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) && 5702 "Unexpected type in LowerBUILD_VECTORvXi1!"); 5703 5704 SDLoc dl(Op); 5705 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 5706 SDValue Cst = DAG.getTargetConstant(0, MVT::i1); 5707 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, 5708 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 5709 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, 5710 Ops, VT.getVectorNumElements()); 5711 } 5712 5713 if (ISD::isBuildVectorAllOnes(Op.getNode())) { 5714 SDValue Cst = DAG.getTargetConstant(1, MVT::i1); 5715 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, 5716 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 5717 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, 5718 Ops, VT.getVectorNumElements()); 5719 } 5720 5721 bool AllConstants = true; 5722 uint64_t Immediate = 0; 5723 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { 5724 SDValue In = Op.getOperand(idx); 5725 if (In.getOpcode() == ISD::UNDEF) 5726 continue; 5727 if (!isa<ConstantSDNode>(In)) { 5728 AllConstants = false; 5729 break; 5730 } 5731 if (cast<ConstantSDNode>(In)->getZExtValue()) 5732 Immediate |= (1ULL << idx); 5733 } 5734 5735 if (AllConstants) { 5736 SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, 5737 DAG.getConstant(Immediate, MVT::i16)); 5738 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask, 5739 DAG.getIntPtrConstant(0)); 5740 } 5741 5742 // Splat vector (with undefs) 5743 SDValue In = Op.getOperand(0); 5744 for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) { 5745 if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF) 5746 llvm_unreachable("Unsupported predicate operation"); 5747 } 5748 5749 SDValue EFLAGS, X86CC; 5750 if (In.getOpcode() == ISD::SETCC) { 5751 SDValue Op0 = In.getOperand(0); 5752 SDValue Op1 = In.getOperand(1); 5753 ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get(); 5754 bool isFP = Op1.getValueType().isFloatingPoint(); 5755 unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5756 5757 assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation"); 5758 5759 X86CC = DAG.getConstant(X86CCVal, MVT::i8); 5760 EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG); 5761 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); 5762 } else if
(In.getOpcode() == X86ISD::SETCC) { 5763 X86CC = In.getOperand(0); 5764 EFLAGS = In.getOperand(1); 5765 } else { 5766 // The algorithm: 5767 // Bit1 = In & 0x1 5768 // if (Bit1 != 0) 5769 // ZF = 0 5770 // else 5771 // ZF = 1 5772 // if (ZF == 0) 5773 // res = allOnes ### CMOVNE -1, %res 5774 // else 5775 // res = allZero 5776 MVT InVT = In.getSimpleValueType(); 5777 SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT)); 5778 EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG); 5779 X86CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5780 } 5781 5782 if (VT == MVT::v16i1) { 5783 SDValue Cst1 = DAG.getConstant(-1, MVT::i16); 5784 SDValue Cst0 = DAG.getConstant(0, MVT::i16); 5785 SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16, 5786 Cst0, Cst1, X86CC, EFLAGS); 5787 return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); 5788 } 5789 5790 if (VT == MVT::v8i1) { 5791 SDValue Cst1 = DAG.getConstant(-1, MVT::i32); 5792 SDValue Cst0 = DAG.getConstant(0, MVT::i32); 5793 SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32, 5794 Cst0, Cst1, X86CC, EFLAGS); 5795 CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp); 5796 return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); 5797 } 5798 llvm_unreachable("Unsupported predicate operation"); 5799} 5800 5801SDValue 5802X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5803 SDLoc dl(Op); 5804 5805 MVT VT = Op.getSimpleValueType(); 5806 MVT ExtVT = VT.getVectorElementType(); 5807 unsigned NumElems = Op.getNumOperands(); 5808 5809 // Generate vectors for predicate (i1 element) vectors. 5810 if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) 5811 return LowerBUILD_VECTORvXi1(Op, DAG); 5812 5813 // Vectors containing all zeros can be matched by pxor and xorps later. 5814 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 5815 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 5816 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 5817 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) 5818 return Op; 5819 5820 return getZeroVector(VT, Subtarget, DAG, dl); 5821 } 5822 5823 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 5824 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use 5825 // vpcmpeqd on 256-bit vectors. 5826 if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { 5827 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) 5828 return Op; 5829 5830 if (!VT.is512BitVector()) 5831 return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); 5832 } 5833 5834 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); 5835 if (Broadcast.getNode()) 5836 return Broadcast; 5837 5838 unsigned EVTBits = ExtVT.getSizeInBits(); 5839 5840 unsigned NumZero = 0; 5841 unsigned NumNonZero = 0; 5842 unsigned NonZeros = 0; 5843 bool IsAllConstants = true; 5844 SmallSet<SDValue, 8> Values; 5845 for (unsigned i = 0; i < NumElems; ++i) { 5846 SDValue Elt = Op.getOperand(i); 5847 if (Elt.getOpcode() == ISD::UNDEF) 5848 continue; 5849 Values.insert(Elt); 5850 if (Elt.getOpcode() != ISD::Constant && 5851 Elt.getOpcode() != ISD::ConstantFP) 5852 IsAllConstants = false; 5853 if (X86::isZeroNode(Elt)) 5854 NumZero++; 5855 else { 5856 NonZeros |= (1 << i); 5857 NumNonZero++; 5858 } 5859 } 5860 5861 // All undef vector. Return an UNDEF. All zero vectors were handled above. 5862 if (NumNonZero == 0) 5863 return DAG.getUNDEF(VT); 5864 5865 // Special case for a single non-zero, non-undef element.
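// e.g. <0, x, 0, 0> or <undef, x, undef, undef>.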
5866 if (NumNonZero == 1) { 5867 unsigned Idx = countTrailingZeros(NonZeros); 5868 SDValue Item = Op.getOperand(Idx); 5869 5870 // If this is an insertion of an i64 value on x86-32, and if the top bits of 5871 // the value are obviously zero, truncate the value to i32 and do the 5872 // insertion that way. Only do this if the value is non-constant or if the 5873 // value is a constant being inserted into element 0. It is cheaper to do 5874 // a constant pool load than it is to do a movd + shuffle. 5875 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 5876 (!IsAllConstants || Idx == 0)) { 5877 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 5878 // Handle SSE only. 5879 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 5880 EVT VecVT = MVT::v4i32; 5881 unsigned VecElts = 4; 5882 5883 // Truncate the value (which may itself be a constant) to i32, and 5884 // convert it to a vector with movd (S2V+shuffle to zero extend). 5885 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 5886 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 5887 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5888 5889 // Now we have our 32-bit value zero extended in the low element of 5890 // a vector. If Idx != 0, swizzle it into place. 5891 if (Idx != 0) { 5892 SmallVector<int, 4> Mask; 5893 Mask.push_back(Idx); 5894 for (unsigned i = 1; i != VecElts; ++i) 5895 Mask.push_back(i); 5896 Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), 5897 &Mask[0]); 5898 } 5899 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5900 } 5901 } 5902 5903 // If we have a constant or non-constant insertion into the low element of 5904 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 5905 // the rest of the elements. This will be matched as movd/movq/movss/movsd 5906 // depending on what the source datatype is. 5907 if (Idx == 0) { 5908 if (NumZero == 0) 5909 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5910 5911 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 5912 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 5913 if (VT.is256BitVector() || VT.is512BitVector()) { 5914 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); 5915 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, 5916 Item, DAG.getIntPtrConstant(0)); 5917 } 5918 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5919 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5920 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 5921 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5922 } 5923 5924 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 5925 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 5926 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 5927 if (VT.is256BitVector()) { 5928 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); 5929 Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); 5930 } else { 5931 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5932 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5933 } 5934 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5935 } 5936 } 5937 5938 // Is it a vector logical left shift? 
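// e.g. (illustrative) a v2i64 build_vector <0, x> is x shifted left across
// the whole vector by 64 bits, i.e. pslldq $8 of (scalar_to_vector x).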
5939 if (NumElems == 2 && Idx == 1 && 5940 X86::isZeroNode(Op.getOperand(0)) && 5941 !X86::isZeroNode(Op.getOperand(1))) { 5942 unsigned NumBits = VT.getSizeInBits(); 5943 return getVShift(true, VT, 5944 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5945 VT, Op.getOperand(1)), 5946 NumBits/2, DAG, *this, dl); 5947 } 5948 5949 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 5950 return SDValue(); 5951 5952 // Otherwise, if this is a vector with i32 or f32 elements, and the element 5953 // is a non-constant being inserted into an element other than the low one, 5954 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 5955 // movd/movss) to move this into the low element, then shuffle it into 5956 // place. 5957 if (EVTBits == 32) { 5958 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5959 5960 // Turn it into a shuffle of zero and zero-extended scalar to vector. 5961 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); 5962 SmallVector<int, 8> MaskVec; 5963 for (unsigned i = 0; i != NumElems; ++i) 5964 MaskVec.push_back(i == Idx ? 0 : 1); 5965 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 5966 } 5967 } 5968 5969 // Splat is obviously ok. Let legalizer expand it to a shuffle. 5970 if (Values.size() == 1) { 5971 if (EVTBits == 32) { 5972 // Instead of a shuffle like this: 5973 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 5974 // Check if it's possible to issue this instead. 5975 // shuffle (vload ptr), undef, <1, 1, 1, 1> 5976 unsigned Idx = countTrailingZeros(NonZeros); 5977 SDValue Item = Op.getOperand(Idx); 5978 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 5979 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 5980 } 5981 return SDValue(); 5982 } 5983 5984 // A vector full of immediates; various special cases are already 5985 // handled, so this is best done with a single constant-pool load. 5986 if (IsAllConstants) 5987 return SDValue(); 5988 5989 // For AVX-length vectors, build the individual 128-bit pieces and use 5990 // shuffles to put them in place. 5991 if (VT.is256BitVector()) { 5992 SmallVector<SDValue, 32> V; 5993 for (unsigned i = 0; i != NumElems; ++i) 5994 V.push_back(Op.getOperand(i)); 5995 5996 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 5997 5998 // Build both the lower and upper subvector. 5999 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 6000 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 6001 NumElems/2); 6002 6003 // Recreate the wider vector with the lower and upper part. 6004 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 6005 } 6006 6007 // Let legalizer expand 2-wide build_vectors. 6008 if (EVTBits == 64) { 6009 if (NumNonZero == 1) { 6010 // One half is zero or undef. 6011 unsigned Idx = countTrailingZeros(NonZeros); 6012 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 6013 Op.getOperand(Idx)); 6014 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); 6015 } 6016 return SDValue(); 6017 } 6018 6019 // If element VT is < 32 bits, convert it to inserts into a zero vector.
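// (LowerBuildVectorv16i8 above packs adjacent byte pairs into i16 values
// before inserting, so a v16i8 needs at most eight pinsrw-style inserts.)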
6020 if (EVTBits == 8 && NumElems == 16) { 6021 SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG, 6022 Subtarget, *this); 6023 if (V.getNode()) return V; 6024 } 6025 6026 if (EVTBits == 16 && NumElems == 8) { 6027 SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG, 6028 Subtarget, *this); 6029 if (V.getNode()) return V; 6030 } 6031 6032 // If element VT is == 32 bits, turn it into a number of shuffles. 6033 SmallVector<SDValue, 8> V(NumElems); 6034 if (NumElems == 4 && NumZero > 0) { 6035 for (unsigned i = 0; i < 4; ++i) { 6036 bool isZero = !(NonZeros & (1 << i)); 6037 if (isZero) 6038 V[i] = getZeroVector(VT, Subtarget, DAG, dl); 6039 else 6040 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 6041 } 6042 6043 for (unsigned i = 0; i < 2; ++i) { 6044 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 6045 default: break; 6046 case 0: 6047 V[i] = V[i*2]; // Must be a zero vector. 6048 break; 6049 case 1: 6050 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 6051 break; 6052 case 2: 6053 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 6054 break; 6055 case 3: 6056 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 6057 break; 6058 } 6059 } 6060 6061 bool Reverse1 = (NonZeros & 0x3) == 2; 6062 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; 6063 int MaskVec[] = { 6064 Reverse1 ? 1 : 0, 6065 Reverse1 ? 0 : 1, 6066 static_cast<int>(Reverse2 ? NumElems+1 : NumElems), 6067 static_cast<int>(Reverse2 ? NumElems : NumElems+1) 6068 }; 6069 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 6070 } 6071 6072 if (Values.size() > 1 && VT.is128BitVector()) { 6073 // Check for a build vector of consecutive loads. 6074 for (unsigned i = 0; i < NumElems; ++i) 6075 V[i] = Op.getOperand(i); 6076 6077 // Check for elements which are consecutive loads. 6078 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 6079 if (LD.getNode()) 6080 return LD; 6081 6082 // Check for a build vector built mostly from a shuffle plus a few inserts. 6083 SDValue Sh = buildFromShuffleMostly(Op, DAG); 6084 if (Sh.getNode()) 6085 return Sh; 6086 6087 // For SSE 4.1, use insertps to put the high elements into the low element. 6088 if (getSubtarget()->hasSSE41()) { 6089 SDValue Result; 6090 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 6091 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 6092 else 6093 Result = DAG.getUNDEF(VT); 6094 6095 for (unsigned i = 1; i < NumElems; ++i) { 6096 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 6097 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 6098 Op.getOperand(i), DAG.getIntPtrConstant(i)); 6099 } 6100 return Result; 6101 } 6102 6103 // Otherwise, expand into a number of unpckl*, start by extending each of 6104 // our (non-undef) elements to the full vector width with the element in the 6105 // bottom slot of the vector (which generates no code for SSE). 6106 for (unsigned i = 0; i < NumElems; ++i) { 6107 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 6108 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 6109 else 6110 V[i] = DAG.getUNDEF(VT); 6111 } 6112 6113 // Next, we iteratively mix elements, e.g.
for v4f32: 6114 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 6115 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 6116 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 6117 unsigned EltStride = NumElems >> 1; 6118 while (EltStride != 0) { 6119 for (unsigned i = 0; i < EltStride; ++i) { 6120 // If V[i+EltStride] is undef and this is the first round of mixing, 6121 // then it is safe to just drop this shuffle: V[i] is already in the 6122 // right place, the one element (since it's the first round) being 6123 // inserted as undef can be dropped. This isn't safe for successive 6124 // rounds because they will permute elements within both vectors. 6125 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 6126 EltStride == NumElems/2) 6127 continue; 6128 6129 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 6130 } 6131 EltStride >>= 1; 6132 } 6133 return V[0]; 6134 } 6135 return SDValue(); 6136} 6137 6138// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 6139// to create 256-bit vectors from two other 128-bit ones. 6140static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 6141 SDLoc dl(Op); 6142 MVT ResVT = Op.getSimpleValueType(); 6143 6144 assert((ResVT.is256BitVector() || 6145 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); 6146 6147 SDValue V1 = Op.getOperand(0); 6148 SDValue V2 = Op.getOperand(1); 6149 unsigned NumElems = ResVT.getVectorNumElements(); 6150 if (ResVT.is256BitVector()) 6151 return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 6152 6153 return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 6154} 6155 6156static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 6157 assert(Op.getNumOperands() == 2); 6158 6159 // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors 6160 // from two other 128-bit ones. 6161 return LowerAVXCONCAT_VECTORS(Op, DAG); 6162} 6163 6164// Try to lower a shuffle node into a simple blend instruction. 6165static SDValue 6166LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, 6167 const X86Subtarget *Subtarget, SelectionDAG &DAG) { 6168 SDValue V1 = SVOp->getOperand(0); 6169 SDValue V2 = SVOp->getOperand(1); 6170 SDLoc dl(SVOp); 6171 MVT VT = SVOp->getSimpleValueType(0); 6172 MVT EltVT = VT.getVectorElementType(); 6173 unsigned NumElems = VT.getVectorNumElements(); 6174 6175 // There is no blend with immediate in AVX-512. 6176 if (VT.is512BitVector()) 6177 return SDValue(); 6178 6179 if (!Subtarget->hasSSE41() || EltVT == MVT::i8) 6180 return SDValue(); 6181 if (!Subtarget->hasInt256() && VT == MVT::v16i16) 6182 return SDValue(); 6183 6184 // Check the mask for BLEND and build the value. 6185 unsigned MaskValue = 0; 6186 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. 6187 unsigned NumLanes = (NumElems-1)/8 + 1; 6188 unsigned NumElemsInLane = NumElems / NumLanes; 6189 6190 // Blend for v16i16 should be symmetric for both lanes. 6191 for (unsigned i = 0; i < NumElemsInLane; ++i) { 6192 6193 int SndLaneEltIdx = (NumLanes == 2) ? 6194 SVOp->getMaskElt(i + NumElemsInLane) : -1; 6195 int EltIdx = SVOp->getMaskElt(i); 6196 6197 if ((EltIdx < 0 || EltIdx == (int)i) && 6198 (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) 6199 continue; 6200 6201 if (((unsigned)EltIdx == (i + NumElems)) && 6202 (SndLaneEltIdx < 0 || 6203 (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) 6204 MaskValue |= (1<<i); 6205 else 6206 return SDValue(); 6207 } 6208 6209 // Convert i32 vectors to floating point if it is not AVX2.
6210 // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. 6211 MVT BlendVT = VT; 6212 if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { 6213 BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), 6214 NumElems); 6215 V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1); 6216 V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2); 6217 } 6218 6219 SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, 6220 DAG.getConstant(MaskValue, MVT::i32)); 6221 return DAG.getNode(ISD::BITCAST, dl, VT, Ret); 6222} 6223 6224// v8i16 shuffles - Prefer shuffles in the following order: 6225// 1. [all] pshuflw, pshufhw, optional move 6226// 2. [ssse3] 1 x pshufb 6227// 3. [ssse3] 2 x pshufb + 1 x por 6228// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 6229static SDValue 6230LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, 6231 SelectionDAG &DAG) { 6232 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6233 SDValue V1 = SVOp->getOperand(0); 6234 SDValue V2 = SVOp->getOperand(1); 6235 SDLoc dl(SVOp); 6236 SmallVector<int, 8> MaskVals; 6237 6238 // Determine if more than 1 of the words in each of the low and high quadwords 6239 // of the result come from the same quadword of one of the two inputs. Undef 6240 // mask values count as coming from any quadword, for better codegen. 6241 unsigned LoQuad[] = { 0, 0, 0, 0 }; 6242 unsigned HiQuad[] = { 0, 0, 0, 0 }; 6243 std::bitset<4> InputQuads; 6244 for (unsigned i = 0; i < 8; ++i) { 6245 unsigned *Quad = i < 4 ? LoQuad : HiQuad; 6246 int EltIdx = SVOp->getMaskElt(i); 6247 MaskVals.push_back(EltIdx); 6248 if (EltIdx < 0) { 6249 ++Quad[0]; 6250 ++Quad[1]; 6251 ++Quad[2]; 6252 ++Quad[3]; 6253 continue; 6254 } 6255 ++Quad[EltIdx / 4]; 6256 InputQuads.set(EltIdx / 4); 6257 } 6258 6259 int BestLoQuad = -1; 6260 unsigned MaxQuad = 1; 6261 for (unsigned i = 0; i < 4; ++i) { 6262 if (LoQuad[i] > MaxQuad) { 6263 BestLoQuad = i; 6264 MaxQuad = LoQuad[i]; 6265 } 6266 } 6267 6268 int BestHiQuad = -1; 6269 MaxQuad = 1; 6270 for (unsigned i = 0; i < 4; ++i) { 6271 if (HiQuad[i] > MaxQuad) { 6272 BestHiQuad = i; 6273 MaxQuad = HiQuad[i]; 6274 } 6275 } 6276 6277 // For SSSE3, if all 8 words of the result come from only 1 quadword of each 6278 // of the two input vectors, shuffle them into one input vector so only a 6279 // single pshufb instruction is necessary. If there are more than 2 input 6280 // quads, disable the next transformation since it does not help SSSE3. 6281 bool V1Used = InputQuads[0] || InputQuads[1]; 6282 bool V2Used = InputQuads[2] || InputQuads[3]; 6283 if (Subtarget->hasSSSE3()) { 6284 if (InputQuads.count() == 2 && V1Used && V2Used) { 6285 BestLoQuad = InputQuads[0] ? 0 : 1; 6286 BestHiQuad = InputQuads[2] ? 2 : 3; 6287 } 6288 if (InputQuads.count() > 2) { 6289 BestLoQuad = -1; 6290 BestHiQuad = -1; 6291 } 6292 } 6293 6294 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 6295 // the shuffle mask. If a quad is scored as -1, that means that it contains 6296 // words from all 4 input quadwords. 6297 SDValue NewV; 6298 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 6299 int MaskV[] = { 6300 BestLoQuad < 0 ? 0 : BestLoQuad, 6301 BestHiQuad < 0 ?
1 : BestHiQuad 6302 }; 6303 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 6304 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 6305 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 6306 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 6307 6308 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 6309 // source words for the shuffle, to aid later transformations. 6310 bool AllWordsInNewV = true; 6311 bool InOrder[2] = { true, true }; 6312 for (unsigned i = 0; i != 8; ++i) { 6313 int idx = MaskVals[i]; 6314 if (idx != (int)i) 6315 InOrder[i/4] = false; 6316 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 6317 continue; 6318 AllWordsInNewV = false; 6319 break; 6320 } 6321 6322 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 6323 if (AllWordsInNewV) { 6324 for (int i = 0; i != 8; ++i) { 6325 int idx = MaskVals[i]; 6326 if (idx < 0) 6327 continue; 6328 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 6329 if ((idx != i) && idx < 4) 6330 pshufhw = false; 6331 if ((idx != i) && idx > 3) 6332 pshuflw = false; 6333 } 6334 V1 = NewV; 6335 V2Used = false; 6336 BestLoQuad = 0; 6337 BestHiQuad = 1; 6338 } 6339 6340 // If we've eliminated the use of V2, and the new mask is a pshuflw or 6341 // pshufhw, that's as cheap as it gets. Return the new shuffle. 6342 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 6343 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 6344 unsigned TargetMask = 0; 6345 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 6346 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 6347 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 6348 TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): 6349 getShufflePSHUFLWImmediate(SVOp); 6350 V1 = NewV.getOperand(0); 6351 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 6352 } 6353 } 6354 6355 // Promote splats to a larger type which usually leads to more efficient code. 6356 // FIXME: Is this true if pshufb is available? 6357 if (SVOp->isSplat()) 6358 return PromoteSplat(SVOp, DAG); 6359 6360 // If we have SSSE3, and all words of the result are from 1 input vector, 6361 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 6362 // is present, fall back to case 4. 6363 if (Subtarget->hasSSSE3()) { 6364 SmallVector<SDValue,16> pshufbMask; 6365 6366 // If we have elements from both input vectors, set the high bit of the 6367 // shuffle mask element to zero out elements that come from V2 in the V1 6368 // mask, and elements that come from V1 in the V2 mask, so that the two 6369 // results can be OR'd together. 6370 bool TwoInputs = V1Used && V2Used; 6371 for (unsigned i = 0; i != 8; ++i) { 6372 int EltIdx = MaskVals[i] * 2; 6373 int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx; 6374 int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1; 6375 pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); 6376 pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); 6377 } 6378 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 6379 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 6380 DAG.getNode(ISD::BUILD_VECTOR, dl, 6381 MVT::v16i8, &pshufbMask[0], 16)); 6382 if (!TwoInputs) 6383 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 6384 6385 // Calculate the shuffle mask for the second input, shuffle it, and 6386 // OR it with the first shuffled input. 6387 pshufbMask.clear(); 6388 for (unsigned i = 0; i != 8; ++i) { 6389 int EltIdx = MaskVals[i] * 2; 6390 int Idx0 = (EltIdx < 16) ? 
0x80 : EltIdx - 16; 6391 int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15; 6392 pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); 6393 pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); 6394 } 6395 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 6396 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 6397 DAG.getNode(ISD::BUILD_VECTOR, dl, 6398 MVT::v16i8, &pshufbMask[0], 16)); 6399 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 6400 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 6401 } 6402 6403 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 6404 // and update MaskVals with new element order. 6405 std::bitset<8> InOrder; 6406 if (BestLoQuad >= 0) { 6407 int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; 6408 for (int i = 0; i != 4; ++i) { 6409 int idx = MaskVals[i]; 6410 if (idx < 0) { 6411 InOrder.set(i); 6412 } else if ((idx / 4) == BestLoQuad) { 6413 MaskV[i] = idx & 3; 6414 InOrder.set(i); 6415 } 6416 } 6417 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 6418 &MaskV[0]); 6419 6420 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 6421 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 6422 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 6423 NewV.getOperand(0), 6424 getShufflePSHUFLWImmediate(SVOp), DAG); 6425 } 6426 } 6427 6428 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 6429 // and update MaskVals with the new element order. 6430 if (BestHiQuad >= 0) { 6431 int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; 6432 for (unsigned i = 4; i != 8; ++i) { 6433 int idx = MaskVals[i]; 6434 if (idx < 0) { 6435 InOrder.set(i); 6436 } else if ((idx / 4) == BestHiQuad) { 6437 MaskV[i] = (idx & 3) + 4; 6438 InOrder.set(i); 6439 } 6440 } 6441 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 6442 &MaskV[0]); 6443 6444 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 6445 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 6446 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 6447 NewV.getOperand(0), 6448 getShufflePSHUFHWImmediate(SVOp), DAG); 6449 } 6450 } 6451 6452 // In case BestHi & BestLo were both -1, which means each quadword has a word 6453 // from each of the four input quadwords, calculate the InOrder bitvector now 6454 // before falling through to the insert/extract cleanup. 6455 if (BestLoQuad == -1 && BestHiQuad == -1) { 6456 NewV = V1; 6457 for (int i = 0; i != 8; ++i) 6458 if (MaskVals[i] < 0 || MaskVals[i] == i) 6459 InOrder.set(i); 6460 } 6461 6462 // The other elements are put in the right place using pextrw and pinsrw. 6463 for (unsigned i = 0; i != 8; ++i) { 6464 if (InOrder[i]) 6465 continue; 6466 int EltIdx = MaskVals[i]; 6467 if (EltIdx < 0) 6468 continue; 6469 SDValue ExtOp = (EltIdx < 8) ? 6470 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 6471 DAG.getIntPtrConstant(EltIdx)) : 6472 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 6473 DAG.getIntPtrConstant(EltIdx - 8)); 6474 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 6475 DAG.getIntPtrConstant(i)); 6476 } 6477 return NewV; 6478} 6479 6480// v16i8 shuffles - Prefer shuffles in the following order: 6481// 1. [ssse3] 1 x pshufb 6482// 2. [ssse3] 2 x pshufb + 1 x por 6483// 3. 
[all] v8i16 shuffle + N x pextrw + rotate + pinsrw 6484 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 6485 const X86Subtarget* Subtarget, 6486 SelectionDAG &DAG) { 6487 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6488 SDValue V1 = SVOp->getOperand(0); 6489 SDValue V2 = SVOp->getOperand(1); 6490 SDLoc dl(SVOp); 6491 ArrayRef<int> MaskVals = SVOp->getMask(); 6492 6493 // Promote splats to a larger type which usually leads to more efficient code. 6494 // FIXME: Is this true if pshufb is available? 6495 if (SVOp->isSplat()) 6496 return PromoteSplat(SVOp, DAG); 6497 6498 // If we have SSSE3, case 1 is generated when all result bytes come from 6499 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 6500 // present, fall back to case 3. 6501 6502 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 6503 if (Subtarget->hasSSSE3()) { 6504 SmallVector<SDValue,16> pshufbMask; 6505 6506 // If all result elements are from one input vector, then only translate 6507 // undef mask values to 0x80 (zero out result) in the pshufb mask. 6508 // 6509 // Otherwise, we have elements from both input vectors, and must zero out 6510 // elements that come from V2 in the first mask, and V1 in the second mask 6511 // so that we can OR them together. 6512 for (unsigned i = 0; i != 16; ++i) { 6513 int EltIdx = MaskVals[i]; 6514 if (EltIdx < 0 || EltIdx >= 16) 6515 EltIdx = 0x80; 6516 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 6517 } 6518 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 6519 DAG.getNode(ISD::BUILD_VECTOR, dl, 6520 MVT::v16i8, &pshufbMask[0], 16)); 6521 6522 // As PSHUFB will zero elements with negative indices, it's safe to ignore 6523 // the 2nd operand if it's undefined or zero. 6524 if (V2.getOpcode() == ISD::UNDEF || 6525 ISD::isBuildVectorAllZeros(V2.getNode())) 6526 return V1; 6527 6528 // Calculate the shuffle mask for the second input, shuffle it, and 6529 // OR it with the first shuffled input. 6530 pshufbMask.clear(); 6531 for (unsigned i = 0; i != 16; ++i) { 6532 int EltIdx = MaskVals[i]; 6533 EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; 6534 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 6535 } 6536 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 6537 DAG.getNode(ISD::BUILD_VECTOR, dl, 6538 MVT::v16i8, &pshufbMask[0], 16)); 6539 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 6540 } 6541 6542 // No SSSE3 - Calculate in-place words and then fix all out-of-place words 6543 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from 6544 // the 16 different words that comprise the two doublequadword input vectors. 6545 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 6546 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 6547 SDValue NewV = V1; 6548 for (int i = 0; i != 8; ++i) { 6549 int Elt0 = MaskVals[i*2]; 6550 int Elt1 = MaskVals[i*2+1]; 6551 6552 // This word of the result is all undef, skip it. 6553 if (Elt0 < 0 && Elt1 < 0) 6554 continue; 6555 6556 // This word of the result is already in the correct place, skip it. 6557 if ((Elt0 == i*2) && (Elt1 == i*2+1)) 6558 continue; 6559 6560 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 6561 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 6562 SDValue InsElt; 6563 6564 // If Elt0 and Elt1 are defined, are consecutive, and can be loaded 6565 // together with a single extract, extract the combined word and insert it.
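// (Illustrative: Elt0 == 4 and Elt1 == 5 both live in word Elt1/2 == 2 of the source, so one pextrw of word 2 followed by one pinsrw handles the pair.)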
6566 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 6567 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 6568 DAG.getIntPtrConstant(Elt1 / 2)); 6569 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 6570 DAG.getIntPtrConstant(i)); 6571 continue; 6572 } 6573 6574 // If Elt1 is defined, extract it from the appropriate source. If the 6575 // source byte is not also odd, shift the extracted word left 8 bits, 6576 // otherwise clear the bottom 8 bits if we need to do an or. 6577 if (Elt1 >= 0) { 6578 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 6579 DAG.getIntPtrConstant(Elt1 / 2)); 6580 if ((Elt1 & 1) == 0) 6581 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 6582 DAG.getConstant(8, 6583 TLI.getShiftAmountTy(InsElt.getValueType()))); 6584 else if (Elt0 >= 0) 6585 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 6586 DAG.getConstant(0xFF00, MVT::i16)); 6587 } 6588 // If Elt0 is defined, extract it from the appropriate source. If the 6589 // source byte is not also even, shift the extracted word right 8 bits. If 6590 // Elt1 was also defined, OR the extracted values together before 6591 // inserting them in the result. 6592 if (Elt0 >= 0) { 6593 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 6594 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 6595 if ((Elt0 & 1) != 0) 6596 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 6597 DAG.getConstant(8, 6598 TLI.getShiftAmountTy(InsElt0.getValueType()))); 6599 else if (Elt1 >= 0) 6600 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 6601 DAG.getConstant(0x00FF, MVT::i16)); 6602 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 6603 : InsElt0; 6604 } 6605 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 6606 DAG.getIntPtrConstant(i)); 6607 } 6608 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 6609} 6610 6611// v32i8 shuffles - Translate to VPSHUFB if possible. 6612 static 6613 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, 6614 const X86Subtarget *Subtarget, 6615 SelectionDAG &DAG) { 6616 MVT VT = SVOp->getSimpleValueType(0); 6617 SDValue V1 = SVOp->getOperand(0); 6618 SDValue V2 = SVOp->getOperand(1); 6619 SDLoc dl(SVOp); 6620 SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); 6621 6622 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 6623 bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode()); 6624 bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode()); 6625 6626 // VPSHUFB may be generated if 6627 // (1) one of the input vectors is undef or zeroinitializer. 6628 // The mask value 0x80 puts 0 in the corresponding slot of the vector. 6629 // And (2) the mask indices don't cross the 128-bit lanes. 6630 if (VT != MVT::v32i8 || !Subtarget->hasInt256() || 6631 (!V2IsUndef && !V2IsAllZero && !V1IsAllZero)) 6632 return SDValue(); 6633 6634 if (V1IsAllZero && !V2IsAllZero) { 6635 CommuteVectorShuffleMask(MaskVals, 32); 6636 V1 = V2; 6637 } 6638 SmallVector<SDValue, 32> pshufbMask; 6639 for (unsigned i = 0; i != 32; i++) { 6640 int EltIdx = MaskVals[i]; 6641 if (EltIdx < 0 || EltIdx >= 32) 6642 EltIdx = 0x80; 6643 else { 6644 if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16)) 6645 // Cross lane is not allowed.
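// (The 256-bit VPSHUFB shuffles each 128-bit lane independently, so a byte index that selects from the other lane cannot be honored by a single VPSHUFB.)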
6646 return SDValue(); 6647 EltIdx &= 0xf; 6648 } 6649 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 6650 } 6651 return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1, 6652 DAG.getNode(ISD::BUILD_VECTOR, dl, 6653 MVT::v32i8, &pshufbMask[0], 32)); 6654} 6655 6656/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 6657/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 6658/// done when every pair / quad of shuffle mask elements point to elements in 6659/// the right sequence. e.g. 6660/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 6661static 6662SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 6663 SelectionDAG &DAG) { 6664 MVT VT = SVOp->getSimpleValueType(0); 6665 SDLoc dl(SVOp); 6666 unsigned NumElems = VT.getVectorNumElements(); 6667 MVT NewVT; 6668 unsigned Scale; 6669 switch (VT.SimpleTy) { 6670 default: llvm_unreachable("Unexpected!"); 6671 case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; 6672 case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; 6673 case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; 6674 case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break; 6675 case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break; 6676 case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break; 6677 } 6678 6679 SmallVector<int, 8> MaskVec; 6680 for (unsigned i = 0; i != NumElems; i += Scale) { 6681 int StartIdx = -1; 6682 for (unsigned j = 0; j != Scale; ++j) { 6683 int EltIdx = SVOp->getMaskElt(i+j); 6684 if (EltIdx < 0) 6685 continue; 6686 if (StartIdx < 0) 6687 StartIdx = (EltIdx / Scale); 6688 if (EltIdx != (int)(StartIdx*Scale + j)) 6689 return SDValue(); 6690 } 6691 MaskVec.push_back(StartIdx); 6692 } 6693 6694 SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0)); 6695 SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1)); 6696 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 6697} 6698 6699/// getVZextMovL - Return a zero-extending vector move low node. 6700/// 6701static SDValue getVZextMovL(MVT VT, MVT OpVT, 6702 SDValue SrcOp, SelectionDAG &DAG, 6703 const X86Subtarget *Subtarget, SDLoc dl) { 6704 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 6705 LoadSDNode *LD = NULL; 6706 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 6707 LD = dyn_cast<LoadSDNode>(SrcOp); 6708 if (!LD) { 6709 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 6710 // instead. 6711 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 6712 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 6713 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 6714 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 6715 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 6716 // PR2108 6717 OpVT = (OpVT == MVT::v2f64) ? 
MVT::v2i64 : MVT::v4i32; 6718 return DAG.getNode(ISD::BITCAST, dl, VT, 6719 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 6720 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6721 OpVT, 6722 SrcOp.getOperand(0) 6723 .getOperand(0)))); 6724 } 6725 } 6726 } 6727 6728 return DAG.getNode(ISD::BITCAST, dl, VT, 6729 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 6730 DAG.getNode(ISD::BITCAST, dl, 6731 OpVT, SrcOp))); 6732} 6733 6734/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles 6735/// which could not be matched by any known target specific shuffle 6736static SDValue 6737LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 6738 6739 SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG); 6740 if (NewOp.getNode()) 6741 return NewOp; 6742 6743 MVT VT = SVOp->getSimpleValueType(0); 6744 6745 unsigned NumElems = VT.getVectorNumElements(); 6746 unsigned NumLaneElems = NumElems / 2; 6747 6748 SDLoc dl(SVOp); 6749 MVT EltVT = VT.getVectorElementType(); 6750 MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); 6751 SDValue Output[2]; 6752 6753 SmallVector<int, 16> Mask; 6754 for (unsigned l = 0; l < 2; ++l) { 6755 // Build a shuffle mask for the output, discovering on the fly which 6756 // input vectors to use as shuffle operands (recorded in InputUsed). 6757 // If building a suitable shuffle vector proves too hard, then bail 6758 // out with UseBuildVector set. 6759 bool UseBuildVector = false; 6760 int InputUsed[2] = { -1, -1 }; // Not yet discovered. 6761 unsigned LaneStart = l * NumLaneElems; 6762 for (unsigned i = 0; i != NumLaneElems; ++i) { 6763 // The mask element. This indexes into the input. 6764 int Idx = SVOp->getMaskElt(i+LaneStart); 6765 if (Idx < 0) { 6766 // The mask element does not index into any input vector. 6767 Mask.push_back(-1); 6768 continue; 6769 } 6770 6771 // The input vector this mask element indexes into. 6772 int Input = Idx / NumLaneElems; 6773 6774 // Turn the index into an offset from the start of the input vector. 6775 Idx -= Input * NumLaneElems; 6776 6777 // Find or create a shuffle vector operand to hold this input. 6778 unsigned OpNo; 6779 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { 6780 if (InputUsed[OpNo] == Input) 6781 // This input vector is already an operand. 6782 break; 6783 if (InputUsed[OpNo] < 0) { 6784 // Create a new operand for this input vector. 6785 InputUsed[OpNo] = Input; 6786 break; 6787 } 6788 } 6789 6790 if (OpNo >= array_lengthof(InputUsed)) { 6791 // More than two input vectors used! Give up on trying to create a 6792 // shuffle vector. Insert all elements into a BUILD_VECTOR instead. 6793 UseBuildVector = true; 6794 break; 6795 } 6796 6797 // Add the mask index for the new shuffle vector. 6798 Mask.push_back(Idx + OpNo * NumLaneElems); 6799 } 6800 6801 if (UseBuildVector) { 6802 SmallVector<SDValue, 16> SVOps; 6803 for (unsigned i = 0; i != NumLaneElems; ++i) { 6804 // The mask element. This indexes into the input. 6805 int Idx = SVOp->getMaskElt(i+LaneStart); 6806 if (Idx < 0) { 6807 SVOps.push_back(DAG.getUNDEF(EltVT)); 6808 continue; 6809 } 6810 6811 // The input vector this mask element indexes into. 6812 int Input = Idx / NumElems; 6813 6814 // Turn the index into an offset from the start of the input vector. 6815 Idx -= Input * NumElems; 6816 6817 // Extract the vector element by hand. 6818 SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 6819 SVOp->getOperand(Input), 6820 DAG.getIntPtrConstant(Idx))); 6821 } 6822 6823 // Construct the output using a BUILD_VECTOR.
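// (This path is reached when one 128-bit half of the result mixes elements from more than two of the four 128-bit input halves, which no single 128-bit shuffle can express.)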
6824 Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0], 6825 SVOps.size()); 6826 } else if (InputUsed[0] < 0) { 6827 // No input vectors were used! The result is undefined. 6828 Output[l] = DAG.getUNDEF(NVT); 6829 } else { 6830 SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), 6831 (InputUsed[0] % 2) * NumLaneElems, 6832 DAG, dl); 6833 // If only one input was used, use an undefined vector for the other. 6834 SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : 6835 Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), 6836 (InputUsed[1] % 2) * NumLaneElems, DAG, dl); 6837 // At least one input vector was used. Create a new shuffle vector. 6838 Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); 6839 } 6840 6841 Mask.clear(); 6842 } 6843 6844 // Concatenate the result back 6845 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]); 6846} 6847 6848/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with 6849/// 4 elements, and match them with several different shuffle types. 6850static SDValue 6851LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 6852 SDValue V1 = SVOp->getOperand(0); 6853 SDValue V2 = SVOp->getOperand(1); 6854 SDLoc dl(SVOp); 6855 MVT VT = SVOp->getSimpleValueType(0); 6856 6857 assert(VT.is128BitVector() && "Unsupported vector size"); 6858 6859 std::pair<int, int> Locs[4]; 6860 int Mask1[] = { -1, -1, -1, -1 }; 6861 SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); 6862 6863 unsigned NumHi = 0; 6864 unsigned NumLo = 0; 6865 for (unsigned i = 0; i != 4; ++i) { 6866 int Idx = PermMask[i]; 6867 if (Idx < 0) { 6868 Locs[i] = std::make_pair(-1, -1); 6869 } else { 6870 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 6871 if (Idx < 4) { 6872 Locs[i] = std::make_pair(0, NumLo); 6873 Mask1[NumLo] = Idx; 6874 NumLo++; 6875 } else { 6876 Locs[i] = std::make_pair(1, NumHi); 6877 if (2+NumHi < 4) 6878 Mask1[2+NumHi] = Idx; 6879 NumHi++; 6880 } 6881 } 6882 } 6883 6884 if (NumLo <= 2 && NumHi <= 2) { 6885 // No more than two elements come from either vector. This can be 6886 // implemented with two shuffles. The first shuffle gathers the elements. 6887 // The second shuffle, which takes the first shuffle as both of its 6888 // vector operands, puts the elements into the right order. 6889 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6890 6891 int Mask2[] = { -1, -1, -1, -1 }; 6892 6893 for (unsigned i = 0; i != 4; ++i) 6894 if (Locs[i].first != -1) { 6895 unsigned Idx = (i < 2) ? 0 : 4; 6896 Idx += Locs[i].first * 2 + Locs[i].second; 6897 Mask2[i] = Idx; 6898 } 6899 6900 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 6901 } 6902 6903 if (NumLo == 3 || NumHi == 3) { 6904 // Otherwise, we must have three elements from one vector, call it X, and 6905 // one element from the other, call it Y. First, use a shufps to build an 6906 // intermediate vector with the one element from Y and the element from X 6907 // that will be in the same half in the final destination (the indexes don't 6908 // matter). Then, use a shufps to build the final vector, taking the half 6909 // containing the element from Y from the intermediate, and the other half 6910 // from X. 6911 if (NumHi == 3) { 6912 // Normalize it so the 3 elements come from V1. 6913 CommuteVectorShuffleMask(PermMask, 4); 6914 std::swap(V1, V2); 6915 } 6916 6917 // Find the element from V2.
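// (Illustrative: with PermMask <0, 5, 2, 3> after normalization, the scan below stops at HiIndex == 1, the one slot taken from V2.)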
6918 unsigned HiIndex; 6919 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 6920 int Val = PermMask[HiIndex]; 6921 if (Val < 0) 6922 continue; 6923 if (Val >= 4) 6924 break; 6925 } 6926 6927 Mask1[0] = PermMask[HiIndex]; 6928 Mask1[1] = -1; 6929 Mask1[2] = PermMask[HiIndex^1]; 6930 Mask1[3] = -1; 6931 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6932 6933 if (HiIndex >= 2) { 6934 Mask1[0] = PermMask[0]; 6935 Mask1[1] = PermMask[1]; 6936 Mask1[2] = HiIndex & 1 ? 6 : 4; 6937 Mask1[3] = HiIndex & 1 ? 4 : 6; 6938 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6939 } 6940 6941 Mask1[0] = HiIndex & 1 ? 2 : 0; 6942 Mask1[1] = HiIndex & 1 ? 0 : 2; 6943 Mask1[2] = PermMask[2]; 6944 Mask1[3] = PermMask[3]; 6945 if (Mask1[2] >= 0) 6946 Mask1[2] += 4; 6947 if (Mask1[3] >= 0) 6948 Mask1[3] += 4; 6949 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 6950 } 6951 6952 // Break it into (shuffle shuffle_hi, shuffle_lo). 6953 int LoMask[] = { -1, -1, -1, -1 }; 6954 int HiMask[] = { -1, -1, -1, -1 }; 6955 6956 int *MaskPtr = LoMask; 6957 unsigned MaskIdx = 0; 6958 unsigned LoIdx = 0; 6959 unsigned HiIdx = 2; 6960 for (unsigned i = 0; i != 4; ++i) { 6961 if (i == 2) { 6962 MaskPtr = HiMask; 6963 MaskIdx = 1; 6964 LoIdx = 0; 6965 HiIdx = 2; 6966 } 6967 int Idx = PermMask[i]; 6968 if (Idx < 0) { 6969 Locs[i] = std::make_pair(-1, -1); 6970 } else if (Idx < 4) { 6971 Locs[i] = std::make_pair(MaskIdx, LoIdx); 6972 MaskPtr[LoIdx] = Idx; 6973 LoIdx++; 6974 } else { 6975 Locs[i] = std::make_pair(MaskIdx, HiIdx); 6976 MaskPtr[HiIdx] = Idx; 6977 HiIdx++; 6978 } 6979 } 6980 6981 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 6982 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 6983 int MaskOps[] = { -1, -1, -1, -1 }; 6984 for (unsigned i = 0; i != 4; ++i) 6985 if (Locs[i].first != -1) 6986 MaskOps[i] = Locs[i].first * 4 + Locs[i].second; 6987 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 6988} 6989 6990static bool MayFoldVectorLoad(SDValue V) { 6991 while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6992 V = V.getOperand(0); 6993 6994 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 6995 V = V.getOperand(0); 6996 if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && 6997 V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) 6998 // BUILD_VECTOR (load), undef 6999 V = V.getOperand(0); 7000 7001 return MayFoldLoad(V); 7002} 7003 7004static 7005SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) { 7006 MVT VT = Op.getSimpleValueType(); 7007 7008 // Canonicalize to v2f64.
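// (MOVDDUP duplicates the low 64-bit element: for V1 = <a, b> it yields <a, a>. Bitcasting to v2f64 first lets the same pattern cover both the integer and the floating-point cases.)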
7009 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 7010 return DAG.getNode(ISD::BITCAST, dl, VT, 7011 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 7012 V1, DAG)); 7013} 7014 7015static 7016SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, 7017 bool HasSSE2) { 7018 SDValue V1 = Op.getOperand(0); 7019 SDValue V2 = Op.getOperand(1); 7020 MVT VT = Op.getSimpleValueType(); 7021 7022 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 7023 7024 if (HasSSE2 && VT == MVT::v2f64) 7025 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 7026 7027 // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1) 7028 return DAG.getNode(ISD::BITCAST, dl, VT, 7029 getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32, 7030 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1), 7031 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG)); 7032} 7033 7034static 7035SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) { 7036 SDValue V1 = Op.getOperand(0); 7037 SDValue V2 = Op.getOperand(1); 7038 MVT VT = Op.getSimpleValueType(); 7039 7040 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 7041 "unsupported shuffle type"); 7042 7043 if (V2.getOpcode() == ISD::UNDEF) 7044 V2 = V1; 7045 7046 // v4i32 or v4f32 7047 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 7048} 7049 7050static 7051SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 7052 SDValue V1 = Op.getOperand(0); 7053 SDValue V2 = Op.getOperand(1); 7054 MVT VT = Op.getSimpleValueType(); 7055 unsigned NumElems = VT.getVectorNumElements(); 7056 7057 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 7058 // operand of these instructions is only memory, so check if there's a 7059 // potential load folding here, otherwise use SHUFPS or MOVSD to match the 7060 // same masks. 7061 bool CanFoldLoad = false; 7062 7063 // Trivial case, when V2 comes from a load. 7064 if (MayFoldVectorLoad(V2)) 7065 CanFoldLoad = true; 7066 7067 // When V1 is a load, it can be folded later into a store in isel, example: 7068 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 7069 // turns into: 7070 // (MOVLPSmr addr:$src1, VR128:$src2) 7071 // So, recognize this potential and also use MOVLPS or MOVLPD 7072 else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 7073 CanFoldLoad = true; 7074 7075 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 7076 if (CanFoldLoad) { 7077 if (HasSSE2 && NumElems == 2) 7078 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 7079 7080 if (NumElems == 4) 7081 // If we don't care about the second element, proceed to use movss. 7082 if (SVOp->getMaskElt(1) != -1) 7083 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 7084 } 7085 7086 // movl and movlp will both match v2i64, but v2i64 is never matched by 7087 // movl earlier because we make it strict to avoid messing with the movlp load 7088 // folding logic (see the code above getMOVLP call). Match it here then, 7089 // this is horrible, but will stay like this until we move all shuffle 7090 // matching to x86 specific nodes. Note that for the 1st condition all 7091 // types are matched with movsd.
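// (Illustrative: for v2f64 a MOVL mask is <2, 1> -- element 0 taken from V2, element 1 kept from V1 -- which is exactly what movsd does.)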
7092 if (HasSSE2) { 7093 // FIXME: isMOVLMask should be checked and matched before getMOVLP, 7094 // as to remove this logic from here, as much as possible 7095 if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT)) 7096 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 7097 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 7098 } 7099 7100 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 7101 7102 // Invert the operand order and use SHUFPS to match it. 7103 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1, 7104 getShuffleSHUFImmediate(SVOp), DAG); 7105} 7106 7107// Reduce a vector shuffle to zext. 7108static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, 7109 SelectionDAG &DAG) { 7110 // PMOVZX is only available from SSE41. 7111 if (!Subtarget->hasSSE41()) 7112 return SDValue(); 7113 7114 MVT VT = Op.getSimpleValueType(); 7115 7116 // Only AVX2 supports 256-bit vector integer extension. 7117 if (!Subtarget->hasInt256() && VT.is256BitVector()) 7118 return SDValue(); 7119 7120 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 7121 SDLoc DL(Op); 7122 SDValue V1 = Op.getOperand(0); 7123 SDValue V2 = Op.getOperand(1); 7124 unsigned NumElems = VT.getVectorNumElements(); 7125 7126 // Extension is a unary operation, and the element type of the source 7127 // vector must be smaller than i64. 7128 if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() || 7129 VT.getVectorElementType() == MVT::i64) 7130 return SDValue(); 7131 7132 // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4. 7133 unsigned Shift = 1; // Start from 2, i.e. 1 << 1. 7134 while ((1U << Shift) < NumElems) { 7135 if (SVOp->getMaskElt(1U << Shift) == 1) 7136 break; 7137 Shift += 1; 7138 // The maximal ratio is 8, i.e. from i8 to i64. 7139 if (Shift > 3) 7140 return SDValue(); 7141 } 7142 7143 // Check the shuffle mask. 7144 unsigned Mask = (1U << Shift) - 1; 7145 for (unsigned i = 0; i != NumElems; ++i) { 7146 int EltIdx = SVOp->getMaskElt(i); 7147 if ((i & Mask) != 0 && EltIdx != -1) 7148 return SDValue(); 7149 if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift)) 7150 return SDValue(); 7151 } 7152 7153 unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; 7154 MVT NeVT = MVT::getIntegerVT(NBits); 7155 MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift); 7156 7157 if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT)) 7158 return SDValue(); 7159 7160 // Simplify the operand as it's prepared to be fed into shuffle. 7161 unsigned SignificantBits = NVT.getSizeInBits() >> Shift; 7162 if (V1.getOpcode() == ISD::BITCAST && 7163 V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && 7164 V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7165 V1.getOperand(0).getOperand(0) 7166 .getSimpleValueType().getSizeInBits() == SignificantBits) { 7167 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) 7168 SDValue V = V1.getOperand(0).getOperand(0).getOperand(0); 7169 ConstantSDNode *CIdx = 7170 dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1)); 7171 // If it's foldable, i.e. a normal load with a single use, we let 7172 // instruction selection fold it. Otherwise, we shorten the conversion sequence.
7173 if (CIdx && CIdx->getZExtValue() == 0 && 7174 (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) { 7175 MVT FullVT = V.getSimpleValueType(); 7176 MVT V1VT = V1.getSimpleValueType(); 7177 if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) { 7178 // The "ext_vec_elt" node is wider than the result node. 7179 // In this case we should extract subvector from V. 7180 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). 7181 unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits(); 7182 MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(), 7183 FullVT.getVectorNumElements()/Ratio); 7184 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, 7185 DAG.getIntPtrConstant(0)); 7186 } 7187 V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V); 7188 } 7189 } 7190 7191 return DAG.getNode(ISD::BITCAST, DL, VT, 7192 DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); 7193} 7194 7195static SDValue 7196NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, 7197 SelectionDAG &DAG) { 7198 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 7199 MVT VT = Op.getSimpleValueType(); 7200 SDLoc dl(Op); 7201 SDValue V1 = Op.getOperand(0); 7202 SDValue V2 = Op.getOperand(1); 7203 7204 if (isZeroShuffle(SVOp)) 7205 return getZeroVector(VT, Subtarget, DAG, dl); 7206 7207 // Handle splat operations 7208 if (SVOp->isSplat()) { 7209 // Use vbroadcast whenever the splat comes from a foldable load 7210 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); 7211 if (Broadcast.getNode()) 7212 return Broadcast; 7213 } 7214 7215 // Check integer expanding shuffles. 7216 SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG); 7217 if (NewOp.getNode()) 7218 return NewOp; 7219 7220 // If the shuffle can be profitably rewritten as a narrower shuffle, then 7221 // do it! 7222 if (VT == MVT::v8i16 || VT == MVT::v16i8 || 7223 VT == MVT::v16i16 || VT == MVT::v32i8) { 7224 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); 7225 if (NewOp.getNode()) 7226 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 7227 } else if ((VT == MVT::v4i32 || 7228 (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 7229 // FIXME: Figure out a cleaner way to do this. 7230 // Try to make use of movq to zero out the top part. 
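// (movq copies the low 64 bits and zeroes the upper ones, so a shuffle of X against zeroinitializer that keeps only X's low i64 becomes a single movq once it is rewritten as a v2i64 shuffle.)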
7231 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 7232 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); 7233 if (NewOp.getNode()) { 7234 MVT NewVT = NewOp.getSimpleValueType(); 7235 if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), 7236 NewVT, true, false)) 7237 return getVZextMovL(VT, NewVT, NewOp.getOperand(0), 7238 DAG, Subtarget, dl); 7239 } 7240 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 7241 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); 7242 if (NewOp.getNode()) { 7243 MVT NewVT = NewOp.getSimpleValueType(); 7244 if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) 7245 return getVZextMovL(VT, NewVT, NewOp.getOperand(1), 7246 DAG, Subtarget, dl); 7247 } 7248 } 7249 } 7250 return SDValue(); 7251} 7252 7253SDValue 7254X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 7255 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 7256 SDValue V1 = Op.getOperand(0); 7257 SDValue V2 = Op.getOperand(1); 7258 MVT VT = Op.getSimpleValueType(); 7259 SDLoc dl(Op); 7260 unsigned NumElems = VT.getVectorNumElements(); 7261 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 7262 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 7263 bool V1IsSplat = false; 7264 bool V2IsSplat = false; 7265 bool HasSSE2 = Subtarget->hasSSE2(); 7266 bool HasFp256 = Subtarget->hasFp256(); 7267 bool HasInt256 = Subtarget->hasInt256(); 7268 MachineFunction &MF = DAG.getMachineFunction(); 7269 bool OptForSize = MF.getFunction()->getAttributes(). 7270 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); 7271 7272 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); 7273 7274 if (V1IsUndef && V2IsUndef) 7275 return DAG.getUNDEF(VT); 7276 7277 assert(!V1IsUndef && "Op 1 of shuffle should not be undef"); 7278 7279 // Vector shuffle lowering takes 3 steps: 7280 // 7281 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 7282 // narrowing and commutation of operands should be handled. 7283 // 2) Matching of shuffles with known shuffle masks to x86 target specific 7284 // shuffle nodes. 7285 // 3) Rewriting of unmatched masks into new generic shuffle operations, 7286 // so the shuffle can be broken into other shuffles and the legalizer can 7287 // try the lowering again. 7288 // 7289 // The general idea is that no vector_shuffle operation should be left to 7290 // be matched during isel, all of them must be converted to a target specific 7291 // node here. 7292 7293 // Normalize the input vectors. Here splats, zeroed vectors, profitable 7294 // narrowing and commutation of operands should be handled. The actual code 7295 // doesn't include all of those, work in progress... 7296 SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG); 7297 if (NewOp.getNode()) 7298 return NewOp; 7299 7300 SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); 7301 7302 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 7303 // unpckh_undef). Only use pshufd if speed is more important than size. 
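// (Both forms are a single instruction; punpckl/h carries no imm8, so it encodes one byte shorter than pshufd, hence the OptForSize preference below.)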
7304 if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256)) 7305 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 7306 if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256)) 7307 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 7308 7309 if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && 7310 V2IsUndef && MayFoldVectorLoad(V1)) 7311 return getMOVDDup(Op, dl, V1, DAG); 7312 7313 if (isMOVHLPS_v_undef_Mask(M, VT)) 7314 return getMOVHighToLow(Op, dl, DAG); 7315 7316 // Use to match splats 7317 if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef && 7318 (VT == MVT::v2f64 || VT == MVT::v2i64)) 7319 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 7320 7321 if (isPSHUFDMask(M, VT)) { 7322 // The actual implementation will match the mask in the if above and then 7323 // during isel it can match several different instructions, not only pshufd 7324 // as its name says, sad but true, emulate the behavior for now... 7325 if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 7326 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 7327 7328 unsigned TargetMask = getShuffleSHUFImmediate(SVOp); 7329 7330 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 7331 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 7332 7333 if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) 7334 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, 7335 DAG); 7336 7337 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, 7338 TargetMask, DAG); 7339 } 7340 7341 if (isPALIGNRMask(M, VT, Subtarget)) 7342 return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, 7343 getShufflePALIGNRImmediate(SVOp), 7344 DAG); 7345 7346 // Check if this can be converted into a logical shift. 7347 bool isLeft = false; 7348 unsigned ShAmt = 0; 7349 SDValue ShVal; 7350 bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 7351 if (isShift && ShVal.hasOneUse()) { 7352 // If the shifted value has multiple uses, it may be cheaper to use 7353 // v_set0 + movlhps or movhlps, etc. 7354 MVT EltVT = VT.getVectorElementType(); 7355 ShAmt *= EltVT.getSizeInBits(); 7356 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 7357 } 7358 7359 if (isMOVLMask(M, VT)) { 7360 if (ISD::isBuildVectorAllZeros(V1.getNode())) 7361 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 7362 if (!isMOVLPMask(M, VT)) { 7363 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 7364 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 7365 7366 if (VT == MVT::v4i32 || VT == MVT::v4f32) 7367 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 7368 } 7369 } 7370 7371 // FIXME: fold these into legal mask. 7372 if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256)) 7373 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 7374 7375 if (isMOVHLPSMask(M, VT)) 7376 return getMOVHighToLow(Op, dl, DAG); 7377 7378 if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) 7379 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 7380 7381 if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) 7382 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 7383 7384 if (isMOVLPMask(M, VT)) 7385 return getMOVLP(Op, dl, DAG, HasSSE2); 7386 7387 if (ShouldXformToMOVHLPS(M, VT) || 7388 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) 7389 return CommuteVectorShuffle(SVOp, DAG); 7390 7391 if (isShift) { 7392 // No better options. Use a vshldq / vsrldq. 
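// (Illustrative: shuffle X, zeroinitializer, <4, 0, 1, 2> on v4i32 is a whole-vector shift left by one element; ShAmt == 1 is scaled to 32 bits below and emitted as a byte shift of the full register.)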
7393 MVT EltVT = VT.getVectorElementType(); 7394 ShAmt *= EltVT.getSizeInBits(); 7395 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 7396 } 7397 7398 bool Commuted = false; 7399 // FIXME: This should also accept a bitcast of a splat? Be careful, not 7400 // 1,1,1,1 -> v8i16 though. 7401 V1IsSplat = isSplatVector(V1.getNode()); 7402 V2IsSplat = isSplatVector(V2.getNode()); 7403 7404 // Canonicalize the splat or undef, if present, to be on the RHS. 7405 if (!V2IsUndef && V1IsSplat && !V2IsSplat) { 7406 CommuteVectorShuffleMask(M, NumElems); 7407 std::swap(V1, V2); 7408 std::swap(V1IsSplat, V2IsSplat); 7409 Commuted = true; 7410 } 7411 7412 if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { 7413 // Shuffling low element of v1 into undef, just return v1. 7414 if (V2IsUndef) 7415 return V1; 7416 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 7417 // the instruction selector will not match, so get a canonical MOVL with 7418 // swapped operands to undo the commute. 7419 return getMOVL(DAG, dl, VT, V2, V1); 7420 } 7421 7422 if (isUNPCKLMask(M, VT, HasInt256)) 7423 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 7424 7425 if (isUNPCKHMask(M, VT, HasInt256)) 7426 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 7427 7428 if (V2IsSplat) { 7429 // Normalize the mask so all entries that point to V2 point to its first 7430 // element, then try to match unpck{h|l} again. If a match is found, 7431 // return a new vector_shuffle with the corrected mask. 7432 SmallVector<int, 8> NewMask(M.begin(), M.end()); 7433 NormalizeMask(NewMask, NumElems); 7434 if (isUNPCKLMask(NewMask, VT, HasInt256, true)) 7435 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 7436 if (isUNPCKHMask(NewMask, VT, HasInt256, true)) 7437 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 7438 } 7439 7440 if (Commuted) { 7441 // Commute it back and try unpck* again. 7442 // FIXME: this seems wrong. 7443 CommuteVectorShuffleMask(M, NumElems); 7444 std::swap(V1, V2); 7445 std::swap(V1IsSplat, V2IsSplat); 7446 Commuted = false; 7447 7448 if (isUNPCKLMask(M, VT, HasInt256)) 7449 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 7450 7451 if (isUNPCKHMask(M, VT, HasInt256)) 7452 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 7453 } 7454 7455 // Normalize the node to match x86 shuffle ops if needed 7456 if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true))) 7457 return CommuteVectorShuffle(SVOp, DAG); 7458 7459 // The checks below are all present in isShuffleMaskLegal, but they are 7460 // inlined here right now to enable us to directly emit target specific 7461 // nodes, and remove one by one until they don't return Op anymore.
7462 7463 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 7464 SVOp->getSplatIndex() == 0 && V2IsUndef) { 7465 if (VT == MVT::v2f64 || VT == MVT::v2i64) 7466 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 7467 } 7468 7469 if (isPSHUFHWMask(M, VT, HasInt256)) 7470 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 7471 getShufflePSHUFHWImmediate(SVOp), 7472 DAG); 7473 7474 if (isPSHUFLWMask(M, VT, HasInt256)) 7475 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 7476 getShufflePSHUFLWImmediate(SVOp), 7477 DAG); 7478 7479 if (isSHUFPMask(M, VT)) 7480 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, 7481 getShuffleSHUFImmediate(SVOp), DAG); 7482 7483 if (isUNPCKL_v_undef_Mask(M, VT, HasInt256)) 7484 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 7485 if (isUNPCKH_v_undef_Mask(M, VT, HasInt256)) 7486 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 7487 7488 //===--------------------------------------------------------------------===// 7489 // Generate target specific nodes for 128 or 256-bit shuffles only 7490 // supported in the AVX instruction set. 7491 // 7492 7493 // Handle VMOVDDUPY permutations 7494 if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256)) 7495 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); 7496 7497 // Handle VPERMILPS/D* permutations 7498 if (isVPERMILPMask(M, VT)) { 7499 if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32) 7500 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, 7501 getShuffleSHUFImmediate(SVOp), DAG); 7502 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, 7503 getShuffleSHUFImmediate(SVOp), DAG); 7504 } 7505 7506 // Handle VPERM2F128/VPERM2I128 permutations 7507 if (isVPERM2X128Mask(M, VT, HasFp256)) 7508 return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, 7509 V2, getShuffleVPERM2X128Immediate(SVOp), DAG); 7510 7511 SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG); 7512 if (BlendOp.getNode()) 7513 return BlendOp; 7514 7515 unsigned Imm8; 7516 if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8)) 7517 return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG); 7518 7519 if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) || 7520 VT.is512BitVector()) { 7521 MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits()); 7522 MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems); 7523 SmallVector<SDValue, 16> permclMask; 7524 for (unsigned i = 0; i != NumElems; ++i) { 7525 permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT)); 7526 } 7527 7528 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, 7529 &permclMask[0], NumElems); 7530 if (V2IsUndef) 7531 // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 7532 return DAG.getNode(X86ISD::VPERMV, dl, VT, 7533 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); 7534 return DAG.getNode(X86ISD::VPERMV3, dl, VT, 7535 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2); 7536 } 7537 7538 //===--------------------------------------------------------------------===// 7539 // Since no target specific shuffle was selected for this generic one, 7540 // lower it into other known shuffles. FIXME: this isn't true yet, but 7541 // this is the plan. 7542 // 7543 7544 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 
7545 if (VT == MVT::v8i16) { 7546 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); 7547 if (NewOp.getNode()) 7548 return NewOp; 7549 } 7550 7551 if (VT == MVT::v16i8) { 7552 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG); 7553 if (NewOp.getNode()) 7554 return NewOp; 7555 } 7556 7557 if (VT == MVT::v32i8) { 7558 SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); 7559 if (NewOp.getNode()) 7560 return NewOp; 7561 } 7562 7563 // Handle all 128-bit wide vectors with 4 elements, and match them with 7564 // several different shuffle types. 7565 if (NumElems == 4 && VT.is128BitVector()) 7566 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 7567 7568 // Handle general 256-bit shuffles 7569 if (VT.is256BitVector()) 7570 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 7571 7572 return SDValue(); 7573} 7574 7575static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { 7576 MVT VT = Op.getSimpleValueType(); 7577 SDLoc dl(Op); 7578 7579 if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) 7580 return SDValue(); 7581 7582 if (VT.getSizeInBits() == 8) { 7583 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 7584 Op.getOperand(0), Op.getOperand(1)); 7585 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 7586 DAG.getValueType(VT)); 7587 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 7588 } 7589 7590 if (VT.getSizeInBits() == 16) { 7591 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7592 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 7593 if (Idx == 0) 7594 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 7595 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 7596 DAG.getNode(ISD::BITCAST, dl, 7597 MVT::v4i32, 7598 Op.getOperand(0)), 7599 Op.getOperand(1))); 7600 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 7601 Op.getOperand(0), Op.getOperand(1)); 7602 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 7603 DAG.getValueType(VT)); 7604 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 7605 } 7606 7607 if (VT == MVT::f32) { 7608 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 7609 // the result back to FR32 register. It's only worth matching if the 7610 // result has a single use which is a store or a bitcast to i32. And in 7611 // the case of a store, it's not worth it if the index is a constant 0, 7612 // because a MOVSSmr can be used instead, which is smaller and faster. 7613 if (!Op.hasOneUse()) 7614 return SDValue(); 7615 SDNode *User = *Op.getNode()->use_begin(); 7616 if ((User->getOpcode() != ISD::STORE || 7617 (isa<ConstantSDNode>(Op.getOperand(1)) && 7618 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 7619 (User->getOpcode() != ISD::BITCAST || 7620 User->getValueType(0) != MVT::i32)) 7621 return SDValue(); 7622 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 7623 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 7624 Op.getOperand(0)), 7625 Op.getOperand(1)); 7626 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 7627 } 7628 7629 if (VT == MVT::i32 || VT == MVT::i64) { 7630 // ExtractPS/pextrq works with constant index. 
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  if (!isa<ConstantSDNode>(Idx)) {
    if (VecVT.is512BitVector() ||
        (VecVT.is256BitVector() && Subtarget->hasInt256() &&
         VecVT.getVectorElementType().getSizeInBits() == 32)) {

      MVT MaskEltVT =
        MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
                                    MaskEltVT.getSizeInBits());

      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
                                 Idx, DAG.getConstant(0, getPointerTy()));
      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
                         Perm, DAG.getConstant(0, getPointerTy()));
    }
    return SDValue();
  }

  // If this is a 256-bit vector result, first extract the 128-bit vector and
  // then extract the element from the 128-bit vector.
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {

    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    // Get the 128-bit vector.
    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
    MVT EltVT = VecVT.getVectorElementType();

    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();

    // Reduce the index to its position within the extracted 128-bit chunk.
    IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, MVT::i32));
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  if (Subtarget->hasSSE41()) {
    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
    if (Res.getNode())
      return Res;
  }

  MVT VT = Op.getSimpleValueType();
  // TODO: handle v16i8.
  if (VT.getSizeInBits() == 16) {
    SDValue Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BITCAST, dl,
                                                 MVT::v4i32, Vec),
                                     Op.getOperand(1)));
    // Transform it so it matches pextrw, which produces a 32-bit result.
    MVT EltVT = MVT::i32;
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (VT.getSizeInBits() == 32) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
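    // e.g. for Idx == 3 the mask is <3,u,u,u>, i.e. "shufps $3, %xmm0, %xmm0",
    // which moves element 3 into lane 0 where the scalar move can read it.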
    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
    MVT VVT = Op.getOperand(0).getSimpleValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    MVT VVT = Op.getOperand(0).getSimpleValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  return SDValue();
}

static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  SDLoc dl(Op);

  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if (!VT.is128BitVector())
    return SDValue();

  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
      isa<ConstantSDNode>(N2)) {
    unsigned Opc;
    if (VT == MVT::v8i16)
      Opc = X86ISD::PINSRW;
    else
      Opc = X86ISD::PINSRB;

    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
    // second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
    // Bits [7:6] of the constant are the source select. This will always be
    // zero here. The DAG Combiner may combine an extract_elt index into these
    // bits. For example (insert (extract, 3), 2) could be matched by putting
    // the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select. This is the
    // value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
    // combine either a bitwise AND or an insert of float 0.0 to set these bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    // Create this as a scalar to vector.
    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
  }

  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
    // PINSR* works with a constant index.
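    // e.g. (insert_elt v4i32:%x, %y, 1) matches "pinsrd $1" as-is, so no
    // rewriting is needed here.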
    return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  // If this is a 256-bit vector result, first extract the 128-bit vector,
  // insert the element into the extracted half and then place it back.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    if (!isa<ConstantSDNode>(N2))
      return SDValue();

    // Get the desired 128-bit vector half.
    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired half.
    unsigned NumEltsIn128 = 128/EltVT.getSizeInBits();
    unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128;

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getConstant(IdxIn128, MVT::i32));

    // Insert the changed part back into the 256-bit vector.
    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
  }

  if (Subtarget->hasSSE41())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  if (EltVT == MVT::i8)
    return SDValue();

  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    // Transform it so it matches pinsrw, which expects a 16-bit value in a
    // GR32 as its second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
  }
  return SDValue();
}

static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits()/128;
    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }

  if (OpVT == MVT::v1i64 &&
      Op.getOperand(0).getValueType() == MVT::i64)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  assert(OpVT.is128BitVector() && "Expected an SSE type!");
  return DAG.getNode(ISD::BITCAST, dl, OpVT,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}

// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
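// For example, (extract_subvector v8f32:%v, 4) is the upper half of a YMM
// register and matches "vextractf128 $1, %ymm0, %xmm0".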
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc dl(Op);
  SDValue In = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  MVT ResVT = Op.getSimpleValueType();
  MVT InVT = In.getSimpleValueType();

  if (Subtarget->hasFp256()) {
    if (ResVT.is128BitVector() &&
        (InVT.is256BitVector() || InVT.is512BitVector()) &&
        isa<ConstantSDNode>(Idx)) {
      return Extract128BitVector(In, IdxVal, DAG, dl);
    }
    if (ResVT.is256BitVector() && InVT.is512BitVector() &&
        isa<ConstantSDNode>(Idx)) {
      return Extract256BitVector(In, IdxVal, DAG, dl);
    }
  }
  return SDValue();
}

// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                     SelectionDAG &DAG) {
  if (Subtarget->hasFp256()) {
    SDLoc dl(Op.getNode());
    SDValue Vec = Op.getNode()->getOperand(0);
    SDValue SubVec = Op.getNode()->getOperand(1);
    SDValue Idx = Op.getNode()->getOperand(2);

    if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
         Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
        SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
        isa<ConstantSDNode>(Idx)) {
      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
    }

    if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
        SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
        isa<ConstantSDNode>(Idx)) {
      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
      return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
    }
  }
  return SDValue();
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
                                             CP->getAlignment(),
                                             CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
7962 if (OpFlag) { 7963 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7964 DAG.getNode(X86ISD::GlobalBaseReg, 7965 SDLoc(), getPointerTy()), 7966 Result); 7967 } 7968 7969 return Result; 7970} 7971 7972SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 7973 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 7974 7975 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7976 // global base reg. 7977 unsigned char OpFlag = 0; 7978 unsigned WrapperKind = X86ISD::Wrapper; 7979 CodeModel::Model M = getTargetMachine().getCodeModel(); 7980 7981 if (Subtarget->isPICStyleRIPRel() && 7982 (M == CodeModel::Small || M == CodeModel::Kernel)) 7983 WrapperKind = X86ISD::WrapperRIP; 7984 else if (Subtarget->isPICStyleGOT()) 7985 OpFlag = X86II::MO_GOTOFF; 7986 else if (Subtarget->isPICStyleStubPIC()) 7987 OpFlag = X86II::MO_PIC_BASE_OFFSET; 7988 7989 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 7990 OpFlag); 7991 SDLoc DL(JT); 7992 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7993 7994 // With PIC, the address is actually $g + Offset. 7995 if (OpFlag) 7996 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7997 DAG.getNode(X86ISD::GlobalBaseReg, 7998 SDLoc(), getPointerTy()), 7999 Result); 8000 8001 return Result; 8002} 8003 8004SDValue 8005X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 8006 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 8007 8008 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 8009 // global base reg. 8010 unsigned char OpFlag = 0; 8011 unsigned WrapperKind = X86ISD::Wrapper; 8012 CodeModel::Model M = getTargetMachine().getCodeModel(); 8013 8014 if (Subtarget->isPICStyleRIPRel() && 8015 (M == CodeModel::Small || M == CodeModel::Kernel)) { 8016 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) 8017 OpFlag = X86II::MO_GOTPCREL; 8018 WrapperKind = X86ISD::WrapperRIP; 8019 } else if (Subtarget->isPICStyleGOT()) { 8020 OpFlag = X86II::MO_GOT; 8021 } else if (Subtarget->isPICStyleStubPIC()) { 8022 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; 8023 } else if (Subtarget->isPICStyleStubNoDynamic()) { 8024 OpFlag = X86II::MO_DARWIN_NONLAZY; 8025 } 8026 8027 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 8028 8029 SDLoc DL(Op); 8030 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 8031 8032 // With PIC, the address is actually $g + Offset. 8033 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 8034 !Subtarget->is64Bit()) { 8035 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8036 DAG.getNode(X86ISD::GlobalBaseReg, 8037 SDLoc(), getPointerTy()), 8038 Result); 8039 } 8040 8041 // For symbols that require a load from a stub to get the address, emit the 8042 // load. 8043 if (isGlobalStubReference(OpFlag)) 8044 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, 8045 MachinePointerInfo::getGOT(), false, false, false, 0); 8046 8047 return Result; 8048} 8049 8050SDValue 8051X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 8052 // Create the TargetBlockAddressAddress node. 
  unsigned char OpFlags =
    Subtarget->ClassifyBlockAddressReference();
  CodeModel::Model M = getTargetMachine().getCodeModel();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
                                             OpFlags);

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
                                      int64_t Offset, SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags =
    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
  CodeModel::Model M = getTargetMachine().getCodeModel();
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
  }

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(), false, false, false, 0);

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
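  // e.g. a GOT-indirect reference to gv+8 cannot fold the 8 into the GOT
  // entry's address, so the addition is materialized after the stub load.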
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
                         DAG.getConstant(Offset, getPointerTy()));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);

  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
  }

  // TLSADDR will be codegen'ed as a call. Inform MFI that this function has
  // calls.
  MFI->setAdjustsStack(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  SDLoc dl(GA);  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               SDLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.
  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
                                   .getInfo<X86MachineFunctionInfo>();
  MFI->incNumLocalDynamicTLSAccesses();

  SDValue Base;
  if (is64Bit) {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InFlag;
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
    InFlag = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
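  // x@dtpoff is the offset of x within its module's TLS block, e.g.
  // "leal x@dtpoff(%eax), %eax" once the block's base address is in %eax.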
  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
                  MachinePointerInfo(Ptr), false, false, false, 0);

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initial exec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax"          (local exec)
  // or   "addl x@indntpoff,%eax"       (initial exec)
  // or   "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(), false, false, false, 0);
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
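  // e.g. local exec on x86-64: "movq %fs:0, %rax; leaq x@tpoff(%rax), %rax".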
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();

  if (Subtarget->isTargetELF()) {
    TLSModel::Model model = getTargetMachine().getTLSModel(GV);

    switch (model) {
      case TLSModel::GeneralDynamic:
        if (Subtarget->is64Bit())
          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
      case TLSModel::LocalDynamic:
        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
                                           Subtarget->is64Bit());
      case TLSModel::InitialExec:
      case TLSModel::LocalExec:
        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
                                   Subtarget->is64Bit(),
                        getTargetMachine().getRelocationModel() == Reloc::PIC_);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
                 !Subtarget->is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                           DAG.getNode(X86ISD::GlobalBaseReg,
                                       SDLoc(), getPointerTy()),
                           Offset);

    // Lowering the machine isd will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);

    // TLSCALL will be codegen'ed as a call. Inform MFI that this function has
    // calls.
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    MFI->setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
                              Chain.getValue(1));
  }

  if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
    // Just use the implicit TLS architecture.
    // Need to generate something similar to:
    //   mov rdx, qword [gs:abs 58H] ; Load pointer to ThreadLocalStorage
    //                               ; from TEB
    //   mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
    //   mov rcx, qword [rdx+rcx*8]
    //   mov eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    // Windows 64-bit: gs:0x58
    // Windows 32-bit: fs:__tls_array

    // If GV is an alias then use the aliasee for determining
    // thread-localness.
    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
      GV = GA->resolveAliasedGlobal(false);
    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
        (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
         DAG.getExternalSymbol("_tls_array", getPointerTy()));

    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
                                        MachinePointerInfo(Ptr),
                                        false, false, false, 0);

    // Load the _tls_index variable.
    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
    if (Subtarget->is64Bit())
      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
                           IDX, MachinePointerInfo(), MVT::i32,
                           false, false, 0);
    else
      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
                        false, false, false, 0);

    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
                                    getPointerTy());
    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);

    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
                      false, false, false, 0);

    // Get the offset of the start of the .tls section.
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, MVT::i8))
                       : DAG.getConstant(0, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
  }

  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
}

SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (SrcVT.isVector())
    return SDValue();

  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget->is64Bit()) {
    return Op;
  }

  SDLoc dl(Op);
  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot,
                               MachinePointerInfo::getFixedStack(SSFI),
                               false, false, 0);
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD
  SDLoc DL(Op);
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

  unsigned ByteSize = SrcVT.getSizeInBits()/8;

  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
  MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
    MMO =
      DAG.getMachineFunction()
         .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                               MachineMemOperand::MOLoad, ByteSize, ByteSize);
  } else {
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                                    X86ISD::FILD, DL,
                                           Tys, Ops, array_lengthof(Ops),
                                           SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
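    // Round-trip the x87 result through memory: FST to a fresh stack slot,
    // then reload it as an SSE value.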
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    MachineMemOperand *MMO =
      DAG.getMachineFunction()
         .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                               MachineMemOperand::MOStore, SSFISize, SSFISize);

    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
                                    Ops, array_lengthof(Ops),
                                    Op.getValueType(), MMO);
    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
                         MachinePointerInfo::getFixedStack(SSFI),
                         false, false, false, 0);
  }

  return Result;
}

// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
  /*
     movq       %rax, %xmm0
     punpckldq  (c0), %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd      (c1), %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd   %xmm0, %xmm0
     #else
       pshufd   $0x4e, %xmm0, %xmm1
       addpd    %xmm1, %xmm0
     #endif
  */

  SDLoc dl(Op);
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *C0 = ConstantDataVector::get(*Context, CV0);
  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);

  SmallVector<Constant*,2> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
                                      APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
                                      APInt(64, 0x4530000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);

  // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
                              CLod0);

  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
  SDValue Result;

  if (Subtarget->hasSSE3()) {
    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
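    // haddpd of Sub with itself puts lo(Sub) + hi(Sub) in both lanes; lane 0
    // is the converted double.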
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
                                           S2F, 0x4E, DAG);
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
                         Sub);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0));
}

// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // FP constant to bias-correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));

  // Zero out the upper parts of the register.
  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  EVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64))
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  if (DestVT.bitsGT(MVT::f64))
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);

  // No rounding needed; the result is already f64.
  return Sub;
}

SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  EVT SVT = N0.getValueType();
  SDLoc dl(Op);

  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
         "Custom UINT_TO_FP is not supported!");

  EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                             SVT.getVectorNumElements());
  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
}

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);

  if (Op.getValueType().isVector())
    return lowerUINT_TO_FP_vec(Op, DAG);

  // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
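  // e.g. (uint_to_fp (and x, 0x7fffffff)) has a known-zero sign bit, so a
  // plain signed conversion gives the same result.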
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  EVT SrcVT = N0.getValueType();
  EVT DstVT = Op.getValueType();
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);
  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    SDValue WordOff = DAG.getConstant(4, getPointerTy());
    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
                                     getPointerTy(), StackSlot, WordOff);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
                                  OffsetSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot, MachinePointerInfo(),
                               false, false, 0);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO =
    DAG.getMachineFunction()
       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                             MachineMemOperand::MOLoad, 8, 8);

  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
                                         array_lengthof(Ops), MVT::i64, MMO);

  APInt FF(32, 0x5F800000ULL);

  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(dl,
                                 getSetCCResultType(*DAG.getContext(), MVT::i64),
                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
                                 ISD::SETLT);

  // Build a 64-bit pair (0, FF) in the constant pool, with FF in the lo bits.
  SDValue FudgePtr = DAG.getConstantPool(
                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
                             getPointerTy());

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0);
  SDValue Four = DAG.getIntPtrConstant(4);
  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
                               Zero, Four);
  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
                                 FudgePtr, MachinePointerInfo::getConstantPool(),
                                 MVT::f32, false, false, 4);
  // Extend everything to 80 bits to force it to be done on x87.
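  // Fudge is 2^64 (0x5F800000 as f32) when the input's sign bit was set,
  // undoing the wraparound from fild treating the i64 as signed, and 0.0
  // otherwise.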
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
}

std::pair<SDValue,SDValue>
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                   bool IsSigned, bool IsReplace) const {
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();

  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget->is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());

  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
  // stack slot, or into the FTOL runtime function.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getSizeInBits()/8;
  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  unsigned Opc;
  if (!IsSigned && isIntegerTypeFTOL(DstTy))
    Opc = X86ISD::WIN_FTOL;
  else
    switch (DstTy.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
    }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  EVT TheVT = Op.getOperand(0).getValueType();
  // FIXME: This causes a redundant load/store if the SSE-class value is
  // already in memory, such as if it is on the call stack.
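  // Spill the SSE value below so the x87 FLD can pick it up from memory.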
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
                         MachinePointerInfo::getFixedStack(SSFI),
                         false, false, 0);
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(TheVT)
    };

    MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                              MachineMemOperand::MOLoad, MemSize, MemSize);
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
                                    array_lengthof(Ops), DstTy, MMO);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  }

  MachineMemOperand *MMO =
    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                            MachineMemOperand::MOStore, MemSize, MemSize);

  if (Opc != X86ISD::WIN_FTOL) {
    // Build the FP_TO_INT*_IN_MEM node.
    SDValue Ops[] = { Chain, Value, StackSlot };
    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                           Ops, array_lengthof(Ops), DstTy,
                                           MMO);
    return std::make_pair(FIST, StackSlot);
  } else {
    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
                               DAG.getVTList(MVT::Other, MVT::Glue),
                               Chain, Value);
    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
                                     MVT::i32, ftol.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
                                     MVT::i32, eax.getValue(2));
    SDValue Ops[] = { eax, edx };
    SDValue pair = IsReplace
      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
      : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
    return std::make_pair(pair, SDValue());
  }
}

static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget *Subtarget) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //

  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
    return SDValue();

  if (Subtarget->hasInt256())
    return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In);

  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
                             VT.getVectorNumElements()/2);

  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}

static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getValueType(0).getSimpleVT();
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getValueType().getSimpleVT();
  SDLoc DL(Op);
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts != 8 && NumElts != 16)
    return SDValue();

  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);

  EVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // Now only mask extension remains to be handled.
  assert(InVT.getVectorElementType() == MVT::i1);
  SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
  const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
                           MachinePointerInfo::getConstantPool(),
                           false, false, false, Alignment);

  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
  if (VT.is512BitVector())
    return Brcst;
  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
}

static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
                               SelectionDAG &DAG) {
  if (Subtarget->hasFp256()) {
    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
    if (Res.getNode())
      return Res;
  }

  return SDValue();
}

static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
                                SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_AVX512(Op, DAG);

  if (Subtarget->hasFp256()) {
    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
    if (Res.getNode())
      return Res;
  }

  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
         VT.getVectorNumElements() != SVT.getVectorNumElements());
  return SDValue();
}

SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
    if (VT.getVectorElementType().getSizeInBits() >= 8)
      return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);

    assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
    unsigned NumElts = InVT.getVectorNumElements();
    assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
    if (InVT.getSizeInBits() < 512) {
      MVT ExtVT = (NumElts == 16) ? MVT::v16i32 : MVT::v8i64;
      In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
      InVT = ExtVT;
    }
    SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
    const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
    SDValue CP = DAG.getConstantPool(C, getPointerTy());
    unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
    SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
                             MachinePointerInfo::getConstantPool(),
                             false, false, false, Alignment);
    SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
    SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
    return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
  }

  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget->hasInt256()) {
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
                                ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0));
    }

    // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(0));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(2));

    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);

    // The PSHUFD mask:
    static const int ShufMask1[] = {0, 2, 0, 0};
    SDValue Undef = DAG.getUNDEF(VT);
    OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
    OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);

    // The MOVLHPS mask:
    static const int ShufMask2[] = {0, 1, 4, 5};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
  }

  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes a PSHUFB.
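    // The mask below keeps bytes {0,1,4,5,8,9,12,13} of each 128-bit lane
    // (the low halves of the i32 lanes) and zeroes the rest; a mask byte of
    // 0x80 selects zero.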
    if (Subtarget->hasInt256()) {
      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);

      SmallVector<SDValue,32> pshufbMask;
      for (unsigned i = 0; i < 2; ++i) {
        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
        for (unsigned j = 0; j < 8; ++j)
          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
      }
      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
                               &pshufbMask[0], 32);
      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);

      static const int ShufMask[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
                                &ShufMask[0]);
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                       DAG.getIntPtrConstant(0));
      return DAG.getNode(ISD::BITCAST, DL, VT, In);
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(0));

    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(4));

    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);

    // The PSHUFB mask:
    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                   -1, -1, -1, -1, -1, -1, -1, -1};

    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);

    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);

    // The MOVLHPS mask:
    static const int ShufMask2[] = {0, 1, 4, 5};
    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
  }

  // Handle truncation of V256 to V128 using shuffles.
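  // e.g. v8i32 -> v8i16: bitcast the source to v16i16, shuffle the even
  // elements <0,2,4,...,14> to the front, and take the low 128-bit subvector.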
  if (!VT.is128BitVector() || !InVT.is256BitVector())
    return SDValue();

  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");

  unsigned NumElems = VT.getVectorNumElements();
  EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                             NumElems * 2);

  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
  // Prepare the truncation shuffle mask.
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = i * 2;
  SDValue V = DAG.getVectorShuffle(NVT, DL,
                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
                                   DAG.getUNDEF(NVT), &MaskVec[0]);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                     DAG.getIntPtrConstant(0));
}

SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  if (VT.isVector()) {
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
                         DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op),
                                     MVT::v8i32, Op.getOperand(0)));
    return SDValue();
  }

  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
    /*IsSigned=*/ true, /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (FIST.getNode() == 0) return Op;

  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
                       FIST, StackSlot, MachinePointerInfo(),
                       false, false, false, 0);

  // The node is the result.
  return FIST;
}

SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
    /*IsSigned=*/ false, /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  assert(FIST.getNode() && "Unexpected failure");

  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
                       FIST, StackSlot, MachinePointerInfo(),
                       false, false, false, 0);

  // The node is the result.
  return FIST;
}

static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
                                 In, DAG.getUNDEF(SVT)));
}
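// LowerFABS and LowerFNEG below both exploit the IEEE-754 layout, in which
// the most significant bit of the representation is the sign. A hedged
// scalar sketch of the same trick (illustrative only, not compiler code):
//
//   double Fabs64(double X) {
//     uint64_t Bits;
//     memcpy(&Bits, &X, sizeof(Bits));
//     Bits &= ~(1ULL << 63);            // clear the sign bit, as FABS does
//     memcpy(&X, &Bits, sizeof(Bits));
//     return X;
//   }
//
// FNEG is identical except it XORs (1ULL << 63) to flip the sign instead.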
SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT;
  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
  if (VT.isVector()) {
    EltVT = VT.getVectorElementType();
    NumElts = VT.getVectorNumElements();
  }
  Constant *C;
  if (EltVT == MVT::f64)
    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
                                          APInt(64, ~(1ULL << 63))));
  else
    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
                                          APInt(32, ~(1U << 31))));
  C = ConstantVector::getSplat(NumElts, C);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             MachinePointerInfo::getConstantPool(),
                             false, false, false, Alignment);
  if (VT.isVector()) {
    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
    return DAG.getNode(ISD::BITCAST, dl, VT,
                       DAG.getNode(ISD::AND, dl, ANDVT,
                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
                                               Op.getOperand(0)),
                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
  }
  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}

SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT;
  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
  if (VT.isVector()) {
    EltVT = VT.getVectorElementType();
    NumElts = VT.getVectorNumElements();
  }
  Constant *C;
  if (EltVT == MVT::f64)
    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
                                          APInt(64, 1ULL << 63)));
  else
    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
                                          APInt(32, 1U << 31)));
  C = ConstantVector::getSplat(NumElts, C);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             MachinePointerInfo::getConstantPool(),
                             false, false, false, Alignment);
  if (VT.isVector()) {
    MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
    return DAG.getNode(ISD::BITCAST, dl, VT,
                       DAG.getNode(ISD::XOR, dl, XORVT,
                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
                                               Op.getOperand(0)),
                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
  }

  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
}

SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT SrcVT = Op1.getSimpleValueType();

  // If the second operand is smaller, extend it first.
  if (SrcVT.bitsLT(VT)) {
    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
    SrcVT = VT;
  }
  // And if it is bigger, shrink it first.
  if (SrcVT.bitsGT(VT)) {
    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
    SrcVT = VT;
  }

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
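  // The remainder of this function implements the scalar identity
  //
  //   copysign(x, y) == (bits(x) & ~SignMask) | (bits(y) & SignMask)
  //
  // with the two masks materialized as constant-pool loads and the bitwise
  // steps expressed as FAND/FOR nodes (a sketch of intent, not extra code).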
  // First get the sign bit of the second operand.
  SmallVector<Constant*,4> CV;
  if (SrcVT == MVT::f64) {
    const fltSemantics &Sem = APFloat::IEEEdouble;
    CV.push_back(ConstantFP::get(*Context,
                                 APFloat(Sem, APInt(64, 1ULL << 63))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
  } else {
    const fltSemantics &Sem = APFloat::IEEEsingle;
    CV.push_back(ConstantFP::get(*Context,
                                 APFloat(Sem, APInt(32, 1U << 31))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);

  // Shift the sign bit right or left if the two operands have different types.
  if (SrcVT.bitsGT(VT)) {
    // Op0 is MVT::f32, Op1 is MVT::f64.
    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
                          DAG.getConstant(32, MVT::i32));
    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
                          DAG.getIntPtrConstant(0));
  }

  // Clear the sign bit of the first operand.
  CV.clear();
  if (VT == MVT::f64) {
    const fltSemantics &Sem = APFloat::IEEEdouble;
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
                                                   APInt(64, ~(1ULL << 63)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
  } else {
    const fltSemantics &Sem = APFloat::IEEEsingle;
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
                                                   APInt(32, ~(1U << 31)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
  }
  C = ConstantVector::get(CV);
  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, 16);
  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);

  // OR the value with the sign bit.
  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}

static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
                                  DAG.getConstant(1, VT));
  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
}

// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
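// The shape being matched is what an "is the whole vector zero?" test looks
// like after legalization, e.g. (hedged sketch at the IR level):
//
//   %e0 = extractelement <4 x i32> %v, i32 0
//   ...
//   %e3 = extractelement <4 x i32> %v, i32 3
//   %or = or i32 %e0, ...        ; OR tree over all four lanes
//   %cc = icmp eq i32 %or, 0
//
// which this routine folds into a single PTEST of %v against itself.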
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  if (!Subtarget->hasSSE41())
    return SDValue();

  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  SDLoc DL(N);

  SmallVector<SDValue, 8> Opnds;
  DenseMap<SDValue, unsigned> VecInMap;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is cast into a wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit on anything that is not an EXTRACT_VECTOR_ELT.
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if the index is not a constant.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not a 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
    }
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
  SmallVector<SDValue, 8> VecIns;

  for (DenseMap<SDValue, unsigned>::const_iterator
        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
    VecIns.push_back(I->first);
  }

  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);

  // If more than one full vector is evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is
    // only 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }

  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
                     VecIns.back(), VecIns.back());
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
                                    SelectionDAG &DAG) const {
  SDLoc dl(Op);

  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO:
    NeedOF = true;
    break;
  }

  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
        default: break;
        case ISD::ADD:
        case ISD::SUB:
        case ISD::AND:
        case ISD::OR:
        case ISD::XOR: {
          NeedTruncation = true;
          ArithOp = Arith;
        }
      }
  }

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST. We use the variable 'Op', which is the
  // non-casted variable, when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output. Alas, even if none of our users are stores, that
    // doesn't prove we're O.K. Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->getAPIntValue() == 1) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->getAPIntValue().isAllOnesValue()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }
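    // (Hedged reference sketch.) The point of INC/DEC here is that their
    // EFLAGS result feeds the SETCC directly, e.g.
    //   incl %eax      ; sets ZF/SF/OF but does NOT update CF
    //   jne  target
    // INC/DEC leaving CF untouched is also why the NeedCF check above falls
    // back to the plain CMP/TEST pattern.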
    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::AND: {
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    bool NonFlagUse = false;
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
      SDNode *User = *UI;
      unsigned UOpNo = UI.getOperandNo();
      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
        // Look past the truncate.
        UOpNo = User->use_begin().getOperandNo();
        User = *User->use_begin();
      }

      if (User->getOpcode() != ISD::BRCOND &&
          User->getOpcode() != ISD::SETCC &&
          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
        NonFlagUse = true;
        break;
      }
    }

    if (!NonFlagUse)
      break;
  }
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
        SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
        if (EFLAGS.getNode())
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    break;
  }

  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
    default: break;
    case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
    case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
    case ISD::AND: ConvertedOp = X86ISD::AND; break;
    case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
    case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }
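  // For instance (hedged sketch): for "setcc (trunc i64 (and X, Y)), 0" the
  // AND is rebuilt here as an i32 X86ISD::AND of truncated operands, so isel
  // can select one flag-setting 32-bit AND instead of a 64-bit op plus a
  // separate TEST on the truncated value.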
  if (Opcode == 0)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops;
  for (unsigned i = 0; i != NumOperands; ++i)
    Ops.push_back(Op.getOperand(i));

  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}

/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   SelectionDAG &DAG) const {
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
    if (C->getAPIntValue() == 0)
      return EmitTest(Op0, X86CC, DAG);

  SDLoc dl(Op0);
  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
    // Use SUB instead of CMP to enable CSE between SUB and CMP.
    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
                              Op0, Op1);
    return SDValue(Sub.getNode(), 1);
  }
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}

/// Convert a comparison if required by the subtarget.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
                                                 SelectionDAG &DAG) const {
  // If the subtarget does not support the FUCOMI instruction, floating-point
  // comparisons have to be converted.
  if (Subtarget->hasCMov() ||
      Cmp.getOpcode() != X86ISD::CMP ||
      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
      !Cmp.getOperand(1).getValueType().isFloatingPoint())
    return Cmp;

  // The instruction selector will select an FUCOM instruction instead of
  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
  SDLoc dl(Cmp);
  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
                            DAG.getConstant(8, MVT::i8));
  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
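// The node sequence built above corresponds to the classic pre-FUCOMI x87
// idiom (hedged, illustrative assembly):
//
//   fucom  %st(1)     ; compare, result lands in the FPSW condition bits
//   fnstsw %ax        ; copy FPSW into AX
//   sahf              ; move AH into EFLAGS
//
// The SRL-by-8 plus truncate models the "take AH" step, after which the
// usual unsigned-style condition codes (a/ae/b/be/e/ne) can be branched on.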
static bool isAllOnes(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  return C && C->isAllOnesValue();
}

/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT
/// node if possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
                                     SDLoc dl, SelectionDAG &DAG) const {
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
      if (And00C->getZExtValue() == 1) {
        // If we looked past a truncate, check that it's only truncating away
        // known zeros.
        unsigned BitWidth = Op0.getValueSizeInBits();
        unsigned AndBitWidth = And.getValueSizeInBits();
        if (BitWidth > AndBitWidth) {
          APInt Zeros, Ones;
          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
            return SDValue();
        }
        LHS = Op1;
        RHS = Op0.getOperand(1);
      }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }

    // Use BT if the immediate can't be encoded in a TEST instruction.
    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
      LHS = AndLHS;
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
    }
  }

  if (LHS.getNode()) {
    // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
    // instruction. Since the shift amount is in-range-or-undefined, we know
    // that doing a bittest on the i32 value is ok. We extend to i32 because
    // the encoding for the i16 version is larger than the i32 version.
    // Also promote i16 to i32 for performance / code size reasons.
    if (LHS.getValueType() == MVT::i8 ||
        LHS.getValueType() == MVT::i16)
      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

    // If the operand types disagree, extend the shift amount to match. Since
    // BT ignores high bits (like shifts) we can use anyextend.
    if (LHS.getValueType() != RHS.getValueType())
      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);

    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                       DAG.getConstant(Cond, MVT::i8), BT);
  }

  return SDValue();
}
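// Usage sketch for the BT transform above (hedged): a test like
//   if (Flags & (1ull << N)) ...
// can select to
//   btq %rcx, %rax     ; CF = bit N of %rax
//   jb  taken
// instead of a shift/mask/test sequence, with SETEQ mapping to COND_AE
// (bit clear) and SETNE to COND_B (bit set) as chosen above.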
/// \brief Turns an ISD::CondCode into a value suitable for SSE floating point
/// mask CMPs.
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                              SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; // Fallthrough
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; // Fallthrough
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; // Fallthrough
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; // Fallthrough
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
  case ISD::SETUEQ:
  case ISD::SETONE: SSECC = 8; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}

// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
// ones, and then concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
  SDValue CC = Op.getOperand(2);

  // Extract the LHS vectors.
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);

  // Extract the RHS vectors.
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back.
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}

static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();

  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
         Op.getValueType().getScalarType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  SDLoc dl(Op);

  bool Unsigned = false;
  unsigned SSECC;
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETUGT: Unsigned = true; // Fallthrough
  case ISD::SETGT:  SSECC = 6; break; // NLE
  case ISD::SETULT: Unsigned = true; // Fallthrough
  case ISD::SETLT:  SSECC = 1; break;
  case ISD::SETUGE: Unsigned = true; // Fallthrough
  case ISD::SETGE:  SSECC = 5; break; // NLT
  case ISD::SETULE: Unsigned = true; // Fallthrough
  case ISD::SETLE:  SSECC = 2; break;
  }
  unsigned Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
  return DAG.getNode(Opc, dl, VT, Op0, Op1,
                     DAG.getConstant(SSECC, MVT::i8));
}

static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
    unsigned Opc = X86ISD::CMPP;
    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    }
    // In the two special cases we can't handle, emit two comparisons.
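    // Concretely (hedged): the two SSECC == 8 cases decompose as
    //   UEQ(x, y) = UNORD(x, y) | EQ(x, y)     (CC0 = 3, CC1 = 0, OR)
    //   ONE(x, y) = ORD(x, y) & NEQ(x, y)      (CC0 = 7, CC1 = 4, AND)
    // which is exactly the (CC0, CC1, CombineOpc) table below.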
    if (SSECC == 8) {
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (SetCCOpcode == ISD::SETUEQ) {
        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
      } else {
        assert(SetCCOpcode == ISD::SETONE);
        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, MVT::i8));
      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(SSECC, MVT::i8));
  }

  // Break a 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget->hasInt256())
    return Lower256IntVSETCC(Op, DAG);

  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
  EVT OpVT = Op1.getValueType();
  if (Subtarget->hasAVX512()) {
    if (Op1.getValueType().is512BitVector() ||
        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
      return LowerIntVSETCC_AVX512(Op, DAG);

    // In the AVX-512 architecture, setcc returns a mask with i1 elements,
    // but there is no compare instruction for i8 and i16 elements. We are
    // not talking about 512-bit operands in this case, since those types
    // are illegal.
    if (MaskResult &&
        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
         OpVT.getVectorElementType().getSizeInBits() >= 8))
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }

  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integers, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc;
  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;

  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  Invert = true; // Fallthrough
  case ISD::SETEQ:  Opc = MaskResult ? X86ISD::PCMPEQM : X86ISD::PCMPEQ; break;
  case ISD::SETLT:  Swap = true; // Fallthrough
  case ISD::SETGT:  Opc = MaskResult ? X86ISD::PCMPGTM : X86ISD::PCMPGT; break;
  case ISD::SETGE:  Swap = true; // Fallthrough
  case ISD::SETLE:  Opc = MaskResult ? X86ISD::PCMPGTM : X86ISD::PCMPGT;
                    Invert = true; break;
  case ISD::SETULT: Swap = true; // Fallthrough
  case ISD::SETUGT: Opc = MaskResult ? X86ISD::PCMPGTM : X86ISD::PCMPGT;
                    FlipSigns = true; break;
  case ISD::SETUGE: Swap = true; // Fallthrough
  case ISD::SETULE: Opc = MaskResult ? X86ISD::PCMPGTM : X86ISD::PCMPGT;
                    FlipSigns = true; Invert = true; break;
  }

  // Special case: use min/max operations for SETULE/SETUGE.
  MVT VET = VT.getVectorElementType();
  bool hasMinMax =
    (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
    (Subtarget->hasSSE2()  && (VET == MVT::i8));

  if (hasMinMax) {
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
    }

    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
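  // Two identities used in the v2i64 special cases below (hedged scalar
  // sketches). SSE only has signed compares, but flipping the sign bit
  // turns a signed compare into an unsigned one:
  //
  //   (a <u b) == ((a ^ 0x80000000) <s (b ^ 0x80000000))   // 32-bit lanes
  //
  // and a 64-bit signed greater-than can be assembled from 32-bit halves
  // (the low halves compared unsigned):
  //
  //   bool Sgt64(int32_t AHi, uint32_t ALo, int32_t BHi, uint32_t BLo) {
  //     return (AHi > BHi) || ((AHi == BHi) && (ALo > BLo));
  //   }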
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
      assert(Subtarget->hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the
      // sign bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
        SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                         Sign, Zero, Sign, Zero);
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)).
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq
      // with pcmpeqd + pshufd + pand.
      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  return Result;
}

SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
      Op1.getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(Op1)->isNullValue() &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
    if (NewSetCC.getNode())
      return NewSetCC;
  }

  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
  // of these.
  if (Op1.getOpcode() == ISD::Constant &&
      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
       cast<ConstantSDNode>(Op1)->isNullValue()) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    // If the input is a setcc, then reuse the input setcc or use a new one
    // with the inverted condition.
    if (Op0.getOpcode() == X86ISD::SETCC) {
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^
                    cast<ConstantSDNode>(Op1)->isNullValue();
      if (!Invert) return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
    }
  }

  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
}
// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getNode()->getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
      Opc == X86ISD::SAHF)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD ||
       Opc == X86ISD::SUB ||
       Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB ||
       Opc == X86ISD::SMUL ||
       Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC ||
       Opc == X86ISD::DEC ||
       Opc == X86ISD::OR ||
       Opc == X86ISD::XOR ||
       Opc == X86ISD::AND))
    return true;

  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}

static bool isZero(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  return C && C->isNullValue();
}

static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue VOp0 = V.getOperand(0);
  unsigned InBits = VOp0.getValueSizeInBits();
  unsigned Bits = V.getValueSizeInBits();
  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
}

SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Cond = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  SDLoc DL(Op);
  EVT VT = Op1.getValueType();
  SDValue CC;

  // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE
  // ops are available. Otherwise fp cmovs get lowered into a less efficient
  // branch sequence later on.
  if (Cond.getOpcode() == ISD::SETCC &&
      ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
       (Subtarget->hasSSE1() && VT == MVT::f32)) &&
      VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
    int SSECC = translateX86FSETCC(
        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

    if (SSECC != 8) {
      unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd;
      SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1,
                                DAG.getConstant(SSECC, MVT::i8));
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
    }
  }

  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }

  // (select (x == 0), -1,  y) -> (sign_bit (x - 1)) | y
  // (select (x == 0),  y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0),  y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1,  y) -> ~(sign_bit (x - 1)) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isZero(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);

    unsigned CondCode = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
      SDValue CmpOp0 = Cmp.getOperand(0);
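      // X86ISD::SETCC_CARRY below materializes 0 or -1 straight from CF,
      // i.e. the classic branchless idiom (hedged assembly sketch):
      //   negl %eax          ; CF = (x != 0)
      //   sbbl %eax, %eax    ; EAX = CF ? -1 : 0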
      // Apply further optimizations for special cases:
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0), 0, -1) -> neg & sbb
      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
        if (YC->isNullValue() &&
            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
                                    DAG.getConstant(0, CmpOp0.getValueType()),
                                    CmpOp0);
          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                    DAG.getConstant(X86::COND_B, MVT::i8),
                                    SDValue(Neg.getNode(), 1));
          return Res;
        }

      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
      Cmp = ConvertCmpIfNecessary(Cmp, DAG);

      SDValue Res =   // Res = 0 or -1.
        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);

      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
      if (N2C == 0 || !N2C->isNullValue())
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT VT = Op.getSimpleValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      addTest = false;
    }
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
              Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, MVT::i8);
    addTest = false;
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }

  // a < b ? -1 : 0 -> RES = ~setcc_carry
  // a < b ? 0 : -1 -> RES = setcc_carry
  // a >= b ? -1 : 0 -> RES = setcc_carry
  // a >= b ? 0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::SUB) {
    Cond = ConvertCmpIfNecessary(Cond, DAG);
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }
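  // (Hedged aside.) The overflow arms earlier in this function map e.g.
  // llvm.uadd.with.overflow onto an X86ISD::ADD whose second result is
  // EFLAGS; the overflow is then read as COND_B (carry) for the unsigned
  // forms and COND_O for the signed ones.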
  // X86 doesn't have an i8 cmov. If both operands are the result of a
  // truncate, widen the cmov and push the truncate through. This avoids
  // introducing a new branch during isel and doesn't add any extensions.
  if (Op.getValueType() == MVT::i8 &&
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
    if (T1.getValueType() == T2.getValueType() &&
        // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg &&
        T2.getOpcode() != ISD::CopyFromReg) {
      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
}

static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  unsigned int NumElts = VT.getVectorNumElements();
  if (NumElts != 8 && NumElts != 16)
    return SDValue();

  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");

  MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
  Constant *C = ConstantInt::get(*DAG.getContext(),
    APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));

  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
  SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
                           MachinePointerInfo::getConstantPool(),
                           false, false, false, Alignment);
  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
  if (VT.is512BitVector())
    return Brcst;
  return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
}

static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_AVX512(Op, DAG);

  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i16 || InVT != MVT::v16i8))
    return SDValue();

  if (Subtarget->hasInt256())
    return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In);

  // Optimize vectors in AVX mode: sign extend v8i16 to v8i32 and
  // v4i32 to v4i64.
  //
  // Divide the input vector into two parts; for v4i32 the shuffle masks
  // will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }. Use the vpmovsx
  // instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32 on each half,
  // then concatenate the vectors back to the original VT.
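  // Worked example (hedged), for VT = v8i32 and InVT = v8i16: ShufMask1
  // selects {0,1,2,3,-1,-1,-1,-1} and ShufMask2 selects {4,5,6,7,-1,-1,-1,-1};
  // each v8i16 half is then sign-extended through its low four lanes to
  // v4i32 (the VPMOVSXWD pattern), and the two v4i32 halves are concatenated
  // back into the final v8i32.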
  unsigned NumElems = InVT.getVectorNumElements();
  SDValue Undef = DAG.getUNDEF(InVT);

  SmallVector<int,8> ShufMask1(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask1[i] = i;

  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);

  SmallVector<int,8> ShufMask2(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask2[i] = i + NumElems/2;

  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);

  MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
                                VT.getVectorNumElements()/2);

  OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
  OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}

// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
// from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
// 1 and that the SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (N1C && N1C->getAPIntValue() == 1) {
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  }
  return false;
}

SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond  = Op.getOperand(1);
  SDValue Dest  = Op.getOperand(2);
  SDLoc dl(Op);
  SDValue CC;
  bool Inverted = false;

  if (Cond.getOpcode() == ISD::SETCC) {
    // Check for setcc([su]{add,sub,mul}o == 0).
    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
        isa<ConstantSDNode>(Cond.getOperand(1)) &&
        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
        Cond.getOperand(0).getResNo() == 1 &&
        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
      Inverted = true;
      Cond = Cond.getOperand(0);
    } else {
      SDValue NewCond = LowerSETCC(Cond, DAG);
      if (NewCond.getNode())
        Cond = NewCond;
    }
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD ||
           Cond.getOpcode() == X86ISD::SUB ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getNode()->getOperand(1);
        addTest = false;
        break;
      }
    }
  }
  CondOpcode = Cond.getOpcode();
  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
       Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (Inverted)
      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, MVT::i8);
    addTest = false;
  } else {
    unsigned CondOpc;
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp)) {
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test. However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
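        // (Hedged assembly sketch.) For "br (setoeq x, y), T, F" the
        // two-branch form is roughly:
        //   ucomiss %xmm1, %xmm0
        //   jne F          ; ordered and not equal
        //   jp  F          ; unordered (NaN operand)
        //   jmp T
        // since OEQ needs "ZF set and PF clear", which no single jcc tests.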
10613         if (Cmp == Cond.getOperand(1).getOperand(1) &&
10614             isX86LogicalCmp(Cmp) &&
10615             Op.getNode()->hasOneUse()) {
10616           X86::CondCode CCode =
10617             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
10618           CCode = X86::GetOppositeBranchCondition(CCode);
10619           CC = DAG.getConstant(CCode, MVT::i8);
10620           SDNode *User = *Op.getNode()->use_begin();
10621           // Look for an unconditional branch following this conditional branch.
10622           // We need this because we need to reverse the successors in order
10623           // to implement FCMP_OEQ.
10624           if (User->getOpcode() == ISD::BR) {
10625             SDValue FalseBB = User->getOperand(1);
10626             SDNode *NewBR =
10627               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
10628             assert(NewBR == User);
10629             (void)NewBR;
10630             Dest = FalseBB;
10631
10632             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10633                                 Chain, Dest, CC, Cmp);
10634             X86::CondCode CCode =
10635               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
10636             CCode = X86::GetOppositeBranchCondition(CCode);
10637             CC = DAG.getConstant(CCode, MVT::i8);
10638             Cond = Cmp;
10639             addTest = false;
10640           }
10641         }
10642       }
10643     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
10644       // Recognize the xorb (setcc), 1 pattern; the xor inverts the condition.
10645       // It should be transformed by the DAG combiner except when the condition
10646       // is set by an arithmetic-with-overflow node.
10647       X86::CondCode CCode =
10648         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
10649       CCode = X86::GetOppositeBranchCondition(CCode);
10650       CC = DAG.getConstant(CCode, MVT::i8);
10651       Cond = Cond.getOperand(0).getOperand(1);
10652       addTest = false;
10653     } else if (Cond.getOpcode() == ISD::SETCC &&
10654                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
10655       // For FCMP_OEQ, we can emit
10656       // two branches instead of an explicit AND instruction with a
10657       // separate test. However, we only do this if this block doesn't
10658       // have a fall-through edge, because this requires an explicit
10659       // jmp when the condition is false.
10660       if (Op.getNode()->hasOneUse()) {
10661         SDNode *User = *Op.getNode()->use_begin();
10662         // Look for an unconditional branch following this conditional branch.
10663         // We need this because we need to reverse the successors in order
10664         // to implement FCMP_OEQ.
10665         if (User->getOpcode() == ISD::BR) {
10666           SDValue FalseBB = User->getOperand(1);
10667           SDNode *NewBR =
10668             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
10669           assert(NewBR == User);
10670           (void)NewBR;
10671           Dest = FalseBB;
10672
10673           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
10674                                     Cond.getOperand(0), Cond.getOperand(1));
10675           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
10676           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10677           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10678                               Chain, Dest, CC, Cmp);
10679           CC = DAG.getConstant(X86::COND_P, MVT::i8);
10680           Cond = Cmp;
10681           addTest = false;
10682         }
10683       }
10684     } else if (Cond.getOpcode() == ISD::SETCC &&
10685                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
10686       // For FCMP_UNE, we can emit
10687       // two branches instead of an explicit OR instruction with a
10688       // separate test. However, we only do this if this block doesn't
10689       // have a fall-through edge, because this requires an explicit
10690       // jmp when the condition is false.
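      // Sketch of the sequence built below for 'une' (labels illustrative):
      //   jne  %true    ; ZF == 0: ordered and not equal
      //   jnp  %false   ; PF == 0: ordered, so the operands compared equal
      //   jmp  %true    ; PF == 1: unordered, which satisfies 'une'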
10691       if (Op.getNode()->hasOneUse()) {
10692         SDNode *User = *Op.getNode()->use_begin();
10693         // Look for an unconditional branch following this conditional branch.
10694         // We need this because we need to reverse the successors in order
10695         // to implement FCMP_UNE.
10696         if (User->getOpcode() == ISD::BR) {
10697           SDValue FalseBB = User->getOperand(1);
10698           SDNode *NewBR =
10699             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
10700           assert(NewBR == User);
10701           (void)NewBR;
10702
10703           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
10704                                     Cond.getOperand(0), Cond.getOperand(1));
10705           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
10706           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10707           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10708                               Chain, Dest, CC, Cmp);
10709           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
10710           Cond = Cmp;
10711           addTest = false;
10712           Dest = FalseBB;
10713         }
10714       }
10715     }
10716   }
10717
10718   if (addTest) {
10719     // Look past the truncate if the high bits are known zero.
10720     if (isTruncWithZeroHighBitsInput(Cond, DAG))
10721       Cond = Cond.getOperand(0);
10722
10723     // We know the result of AND is compared against zero. Try to match
10724     // it to BT.
10725     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
10726       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
10727       if (NewSetCC.getNode()) {
10728         CC = NewSetCC.getOperand(0);
10729         Cond = NewSetCC.getOperand(1);
10730         addTest = false;
10731       }
10732     }
10733   }
10734
10735   if (addTest) {
10736     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
10737     Cond = EmitTest(Cond, X86::COND_NE, DAG);
10738   }
10739   Cond = ConvertCmpIfNecessary(Cond, DAG);
10740   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
10741                      Chain, Dest, CC, Cond);
10742 }
10743
10744 // Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
10745 // Calls to _alloca are needed to probe the stack when allocating more than 4k
10746 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
10747 // that the guard pages used by the OS virtual memory manager are allocated in
10748 // the correct sequence.
10749 SDValue
10750 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
10751                                            SelectionDAG &DAG) const {
10752   assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
10753           getTargetMachine().Options.EnableSegmentedStacks) &&
10754          "This should be used only on Windows targets or when segmented stacks "
10755          "are being used");
10756   assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
10757   SDLoc dl(Op);
10758
10759   // Get the inputs.
10760   SDValue Chain = Op.getOperand(0);
10761   SDValue Size = Op.getOperand(1);
10762   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
10763   EVT VT = Op.getNode()->getValueType(0);
10764
10765   bool Is64Bit = Subtarget->is64Bit();
10766   EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
10767
10768   if (getTargetMachine().Options.EnableSegmentedStacks) {
10769     MachineFunction &MF = DAG.getMachineFunction();
10770     MachineRegisterInfo &MRI = MF.getRegInfo();
10771
10772     if (Is64Bit) {
10773       // The 64-bit implementation of segmented stacks needs to clobber both
10774       // r10 and r11. This makes it impossible to use it along with nested parameters.
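      // (The 'nest' parameter is passed in R10 on x86-64, per
      // X86CallingConv.td, so in the illustrative IR
      //   define void @f(i8* nest %ctx)
      // %ctx would be clobbered before the prologue finishes; hence the
      // fatal error below.)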
10775 const Function *F = MF.getFunction(); 10776 10777 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); 10778 I != E; ++I) 10779 if (I->hasNestAttr()) 10780 report_fatal_error("Cannot use segmented stacks with functions that " 10781 "have nested arguments."); 10782 } 10783 10784 const TargetRegisterClass *AddrRegClass = 10785 getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32); 10786 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); 10787 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); 10788 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, 10789 DAG.getRegister(Vreg, SPTy)); 10790 SDValue Ops1[2] = { Value, Chain }; 10791 return DAG.getMergeValues(Ops1, 2, dl); 10792 } else { 10793 SDValue Flag; 10794 unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); 10795 10796 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 10797 Flag = Chain.getValue(1); 10798 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 10799 10800 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 10801 10802 const X86RegisterInfo *RegInfo = 10803 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 10804 unsigned SPReg = RegInfo->getStackRegister(); 10805 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); 10806 Chain = SP.getValue(1); 10807 10808 if (Align) { 10809 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 10810 DAG.getConstant(-(uint64_t)Align, VT)); 10811 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); 10812 } 10813 10814 SDValue Ops1[2] = { SP, Chain }; 10815 return DAG.getMergeValues(Ops1, 2, dl); 10816 } 10817} 10818 10819SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 10820 MachineFunction &MF = DAG.getMachineFunction(); 10821 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 10822 10823 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 10824 SDLoc DL(Op); 10825 10826 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 10827 // vastart just stores the address of the VarArgsFrameIndex slot into the 10828 // memory location argument. 10829 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 10830 getPointerTy()); 10831 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 10832 MachinePointerInfo(SV), false, false, 0); 10833 } 10834 10835 // __va_list_tag: 10836 // gp_offset (0 - 6 * 8) 10837 // fp_offset (48 - 48 + 8 * 16) 10838 // overflow_arg_area (point to parameters coming in memory). 
10839 // reg_save_area 10840 SmallVector<SDValue, 8> MemOps; 10841 SDValue FIN = Op.getOperand(1); 10842 // Store gp_offset 10843 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 10844 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 10845 MVT::i32), 10846 FIN, MachinePointerInfo(SV), false, false, 0); 10847 MemOps.push_back(Store); 10848 10849 // Store fp_offset 10850 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 10851 FIN, DAG.getIntPtrConstant(4)); 10852 Store = DAG.getStore(Op.getOperand(0), DL, 10853 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 10854 MVT::i32), 10855 FIN, MachinePointerInfo(SV, 4), false, false, 0); 10856 MemOps.push_back(Store); 10857 10858 // Store ptr to overflow_arg_area 10859 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 10860 FIN, DAG.getIntPtrConstant(4)); 10861 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 10862 getPointerTy()); 10863 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 10864 MachinePointerInfo(SV, 8), 10865 false, false, 0); 10866 MemOps.push_back(Store); 10867 10868 // Store ptr to reg_save_area. 10869 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 10870 FIN, DAG.getIntPtrConstant(8)); 10871 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 10872 getPointerTy()); 10873 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 10874 MachinePointerInfo(SV, 16), false, false, 0); 10875 MemOps.push_back(Store); 10876 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 10877 &MemOps[0], MemOps.size()); 10878} 10879 10880SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 10881 assert(Subtarget->is64Bit() && 10882 "LowerVAARG only handles 64-bit va_arg!"); 10883 assert((Subtarget->isTargetLinux() || 10884 Subtarget->isTargetDarwin()) && 10885 "Unhandled target in LowerVAARG"); 10886 assert(Op.getNode()->getNumOperands() == 4); 10887 SDValue Chain = Op.getOperand(0); 10888 SDValue SrcPtr = Op.getOperand(1); 10889 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 10890 unsigned Align = Op.getConstantOperandVal(3); 10891 SDLoc dl(Op); 10892 10893 EVT ArgVT = Op.getNode()->getValueType(0); 10894 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 10895 uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); 10896 uint8_t ArgMode; 10897 10898 // Decide which area this value should be read from. 10899 // TODO: Implement the AMD64 ABI in its entirety. This simple 10900 // selection mechanism works only for the basic types. 10901 if (ArgVT == MVT::f80) { 10902 llvm_unreachable("va_arg for f80 not yet implemented"); 10903 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 10904 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 10905 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 10906 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 10907 } else { 10908 llvm_unreachable("Unhandled argument type in LowerVAARG"); 10909 } 10910 10911 if (ArgMode == 2) { 10912 // Sanity Check: Make sure using fp_offset makes sense. 
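    // fp_offset indexes the XMM portion of the register save area; if this
    // function had been compiled without SSE, or with NoImplicitFloat, no
    // XMM argument registers would have been spilled for it, so reading
    // through fp_offset would be meaningless.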
10913 assert(!getTargetMachine().Options.UseSoftFloat && 10914 !(DAG.getMachineFunction() 10915 .getFunction()->getAttributes() 10916 .hasAttribute(AttributeSet::FunctionIndex, 10917 Attribute::NoImplicitFloat)) && 10918 Subtarget->hasSSE1()); 10919 } 10920 10921 // Insert VAARG_64 node into the DAG 10922 // VAARG_64 returns two values: Variable Argument Address, Chain 10923 SmallVector<SDValue, 11> InstOps; 10924 InstOps.push_back(Chain); 10925 InstOps.push_back(SrcPtr); 10926 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 10927 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 10928 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 10929 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 10930 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 10931 VTs, &InstOps[0], InstOps.size(), 10932 MVT::i64, 10933 MachinePointerInfo(SV), 10934 /*Align=*/0, 10935 /*Volatile=*/false, 10936 /*ReadMem=*/true, 10937 /*WriteMem=*/true); 10938 Chain = VAARG.getValue(1); 10939 10940 // Load the next argument and return it 10941 return DAG.getLoad(ArgVT, dl, 10942 Chain, 10943 VAARG, 10944 MachinePointerInfo(), 10945 false, false, false, 0); 10946} 10947 10948static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, 10949 SelectionDAG &DAG) { 10950 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 10951 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 10952 SDValue Chain = Op.getOperand(0); 10953 SDValue DstPtr = Op.getOperand(1); 10954 SDValue SrcPtr = Op.getOperand(2); 10955 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 10956 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 10957 SDLoc DL(Op); 10958 10959 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 10960 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 10961 false, 10962 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 10963} 10964 10965// getTargetVShiftByConstNode - Handle vector element shifts where the shift 10966// amount is a constant. Takes immediate version of shift as input. 10967static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT, 10968 SDValue SrcOp, uint64_t ShiftAmt, 10969 SelectionDAG &DAG) { 10970 10971 // Check for ShiftAmt >= element width 10972 if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) { 10973 if (Opc == X86ISD::VSRAI) 10974 ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1; 10975 else 10976 return DAG.getConstant(0, VT); 10977 } 10978 10979 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) 10980 && "Unknown target vector shift-by-constant node"); 10981 10982 return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); 10983} 10984 10985// getTargetVShiftNode - Handle vector element shifts where the shift amount 10986// may or may not be a constant. Takes immediate version of shift as input. 10987static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT, 10988 SDValue SrcOp, SDValue ShAmt, 10989 SelectionDAG &DAG) { 10990 assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); 10991 10992 // Catch shift-by-constant. 
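  // Illustrative use (operand types assumed, not taken from a caller in
  // this file):
  //   getTargetVShiftNode(X86ISD::VSRAI, dl, MVT::v8i16, Src,
  //                       DAG.getConstant(3, MVT::i32), DAG)
  // stays in the immediate form VSRAI rather than being rebuilt as a VSRA
  // with the amount materialized through the BUILD_VECTOR below.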
10993 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) 10994 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, 10995 CShAmt->getZExtValue(), DAG); 10996 10997 // Change opcode to non-immediate version 10998 switch (Opc) { 10999 default: llvm_unreachable("Unknown target vector shift node"); 11000 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; 11001 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; 11002 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; 11003 } 11004 11005 // Need to build a vector containing shift amount 11006 // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 11007 SDValue ShOps[4]; 11008 ShOps[0] = ShAmt; 11009 ShOps[1] = DAG.getConstant(0, MVT::i32); 11010 ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); 11011 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); 11012 11013 // The return type has to be a 128-bit type with the same element 11014 // type as the input type. 11015 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 11016 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); 11017 11018 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); 11019 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); 11020} 11021 11022static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 11023 SDLoc dl(Op); 11024 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 11025 switch (IntNo) { 11026 default: return SDValue(); // Don't custom lower most intrinsics. 11027 // Comparison intrinsics. 11028 case Intrinsic::x86_sse_comieq_ss: 11029 case Intrinsic::x86_sse_comilt_ss: 11030 case Intrinsic::x86_sse_comile_ss: 11031 case Intrinsic::x86_sse_comigt_ss: 11032 case Intrinsic::x86_sse_comige_ss: 11033 case Intrinsic::x86_sse_comineq_ss: 11034 case Intrinsic::x86_sse_ucomieq_ss: 11035 case Intrinsic::x86_sse_ucomilt_ss: 11036 case Intrinsic::x86_sse_ucomile_ss: 11037 case Intrinsic::x86_sse_ucomigt_ss: 11038 case Intrinsic::x86_sse_ucomige_ss: 11039 case Intrinsic::x86_sse_ucomineq_ss: 11040 case Intrinsic::x86_sse2_comieq_sd: 11041 case Intrinsic::x86_sse2_comilt_sd: 11042 case Intrinsic::x86_sse2_comile_sd: 11043 case Intrinsic::x86_sse2_comigt_sd: 11044 case Intrinsic::x86_sse2_comige_sd: 11045 case Intrinsic::x86_sse2_comineq_sd: 11046 case Intrinsic::x86_sse2_ucomieq_sd: 11047 case Intrinsic::x86_sse2_ucomilt_sd: 11048 case Intrinsic::x86_sse2_ucomile_sd: 11049 case Intrinsic::x86_sse2_ucomigt_sd: 11050 case Intrinsic::x86_sse2_ucomige_sd: 11051 case Intrinsic::x86_sse2_ucomineq_sd: { 11052 unsigned Opc; 11053 ISD::CondCode CC; 11054 switch (IntNo) { 11055 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
11056 case Intrinsic::x86_sse_comieq_ss: 11057 case Intrinsic::x86_sse2_comieq_sd: 11058 Opc = X86ISD::COMI; 11059 CC = ISD::SETEQ; 11060 break; 11061 case Intrinsic::x86_sse_comilt_ss: 11062 case Intrinsic::x86_sse2_comilt_sd: 11063 Opc = X86ISD::COMI; 11064 CC = ISD::SETLT; 11065 break; 11066 case Intrinsic::x86_sse_comile_ss: 11067 case Intrinsic::x86_sse2_comile_sd: 11068 Opc = X86ISD::COMI; 11069 CC = ISD::SETLE; 11070 break; 11071 case Intrinsic::x86_sse_comigt_ss: 11072 case Intrinsic::x86_sse2_comigt_sd: 11073 Opc = X86ISD::COMI; 11074 CC = ISD::SETGT; 11075 break; 11076 case Intrinsic::x86_sse_comige_ss: 11077 case Intrinsic::x86_sse2_comige_sd: 11078 Opc = X86ISD::COMI; 11079 CC = ISD::SETGE; 11080 break; 11081 case Intrinsic::x86_sse_comineq_ss: 11082 case Intrinsic::x86_sse2_comineq_sd: 11083 Opc = X86ISD::COMI; 11084 CC = ISD::SETNE; 11085 break; 11086 case Intrinsic::x86_sse_ucomieq_ss: 11087 case Intrinsic::x86_sse2_ucomieq_sd: 11088 Opc = X86ISD::UCOMI; 11089 CC = ISD::SETEQ; 11090 break; 11091 case Intrinsic::x86_sse_ucomilt_ss: 11092 case Intrinsic::x86_sse2_ucomilt_sd: 11093 Opc = X86ISD::UCOMI; 11094 CC = ISD::SETLT; 11095 break; 11096 case Intrinsic::x86_sse_ucomile_ss: 11097 case Intrinsic::x86_sse2_ucomile_sd: 11098 Opc = X86ISD::UCOMI; 11099 CC = ISD::SETLE; 11100 break; 11101 case Intrinsic::x86_sse_ucomigt_ss: 11102 case Intrinsic::x86_sse2_ucomigt_sd: 11103 Opc = X86ISD::UCOMI; 11104 CC = ISD::SETGT; 11105 break; 11106 case Intrinsic::x86_sse_ucomige_ss: 11107 case Intrinsic::x86_sse2_ucomige_sd: 11108 Opc = X86ISD::UCOMI; 11109 CC = ISD::SETGE; 11110 break; 11111 case Intrinsic::x86_sse_ucomineq_ss: 11112 case Intrinsic::x86_sse2_ucomineq_sd: 11113 Opc = X86ISD::UCOMI; 11114 CC = ISD::SETNE; 11115 break; 11116 } 11117 11118 SDValue LHS = Op.getOperand(1); 11119 SDValue RHS = Op.getOperand(2); 11120 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 11121 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 11122 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 11123 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 11124 DAG.getConstant(X86CC, MVT::i8), Cond); 11125 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 11126 } 11127 11128 // Arithmetic intrinsics. 
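  // These map one-for-one onto target nodes, so lowering is a plain opcode
  // swap. E.g. (illustrative) _mm_mul_epu32, i.e. x86_sse2_pmulu_dq, becomes
  // a single X86ISD::PMULUDQ node carrying the intrinsic's two vector
  // operands.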
11129 case Intrinsic::x86_sse2_pmulu_dq: 11130 case Intrinsic::x86_avx2_pmulu_dq: 11131 return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), 11132 Op.getOperand(1), Op.getOperand(2)); 11133 11134 // SSE2/AVX2 sub with unsigned saturation intrinsics 11135 case Intrinsic::x86_sse2_psubus_b: 11136 case Intrinsic::x86_sse2_psubus_w: 11137 case Intrinsic::x86_avx2_psubus_b: 11138 case Intrinsic::x86_avx2_psubus_w: 11139 return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(), 11140 Op.getOperand(1), Op.getOperand(2)); 11141 11142 // SSE3/AVX horizontal add/sub intrinsics 11143 case Intrinsic::x86_sse3_hadd_ps: 11144 case Intrinsic::x86_sse3_hadd_pd: 11145 case Intrinsic::x86_avx_hadd_ps_256: 11146 case Intrinsic::x86_avx_hadd_pd_256: 11147 case Intrinsic::x86_sse3_hsub_ps: 11148 case Intrinsic::x86_sse3_hsub_pd: 11149 case Intrinsic::x86_avx_hsub_ps_256: 11150 case Intrinsic::x86_avx_hsub_pd_256: 11151 case Intrinsic::x86_ssse3_phadd_w_128: 11152 case Intrinsic::x86_ssse3_phadd_d_128: 11153 case Intrinsic::x86_avx2_phadd_w: 11154 case Intrinsic::x86_avx2_phadd_d: 11155 case Intrinsic::x86_ssse3_phsub_w_128: 11156 case Intrinsic::x86_ssse3_phsub_d_128: 11157 case Intrinsic::x86_avx2_phsub_w: 11158 case Intrinsic::x86_avx2_phsub_d: { 11159 unsigned Opcode; 11160 switch (IntNo) { 11161 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 11162 case Intrinsic::x86_sse3_hadd_ps: 11163 case Intrinsic::x86_sse3_hadd_pd: 11164 case Intrinsic::x86_avx_hadd_ps_256: 11165 case Intrinsic::x86_avx_hadd_pd_256: 11166 Opcode = X86ISD::FHADD; 11167 break; 11168 case Intrinsic::x86_sse3_hsub_ps: 11169 case Intrinsic::x86_sse3_hsub_pd: 11170 case Intrinsic::x86_avx_hsub_ps_256: 11171 case Intrinsic::x86_avx_hsub_pd_256: 11172 Opcode = X86ISD::FHSUB; 11173 break; 11174 case Intrinsic::x86_ssse3_phadd_w_128: 11175 case Intrinsic::x86_ssse3_phadd_d_128: 11176 case Intrinsic::x86_avx2_phadd_w: 11177 case Intrinsic::x86_avx2_phadd_d: 11178 Opcode = X86ISD::HADD; 11179 break; 11180 case Intrinsic::x86_ssse3_phsub_w_128: 11181 case Intrinsic::x86_ssse3_phsub_d_128: 11182 case Intrinsic::x86_avx2_phsub_w: 11183 case Intrinsic::x86_avx2_phsub_d: 11184 Opcode = X86ISD::HSUB; 11185 break; 11186 } 11187 return DAG.getNode(Opcode, dl, Op.getValueType(), 11188 Op.getOperand(1), Op.getOperand(2)); 11189 } 11190 11191 // SSE2/SSE41/AVX2 integer max/min intrinsics. 
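  // All of these collapse onto four generic nodes keyed only on signedness
  // and max vs. min; the element width is carried by the node's value type.
  // E.g. (illustrative) x86_sse41_pmaxsd becomes
  //   (X86ISD::SMAX v4i32:$a, v4i32:$b)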
11192 case Intrinsic::x86_sse2_pmaxu_b: 11193 case Intrinsic::x86_sse41_pmaxuw: 11194 case Intrinsic::x86_sse41_pmaxud: 11195 case Intrinsic::x86_avx2_pmaxu_b: 11196 case Intrinsic::x86_avx2_pmaxu_w: 11197 case Intrinsic::x86_avx2_pmaxu_d: 11198 case Intrinsic::x86_avx512_pmaxu_d: 11199 case Intrinsic::x86_avx512_pmaxu_q: 11200 case Intrinsic::x86_sse2_pminu_b: 11201 case Intrinsic::x86_sse41_pminuw: 11202 case Intrinsic::x86_sse41_pminud: 11203 case Intrinsic::x86_avx2_pminu_b: 11204 case Intrinsic::x86_avx2_pminu_w: 11205 case Intrinsic::x86_avx2_pminu_d: 11206 case Intrinsic::x86_avx512_pminu_d: 11207 case Intrinsic::x86_avx512_pminu_q: 11208 case Intrinsic::x86_sse41_pmaxsb: 11209 case Intrinsic::x86_sse2_pmaxs_w: 11210 case Intrinsic::x86_sse41_pmaxsd: 11211 case Intrinsic::x86_avx2_pmaxs_b: 11212 case Intrinsic::x86_avx2_pmaxs_w: 11213 case Intrinsic::x86_avx2_pmaxs_d: 11214 case Intrinsic::x86_avx512_pmaxs_d: 11215 case Intrinsic::x86_avx512_pmaxs_q: 11216 case Intrinsic::x86_sse41_pminsb: 11217 case Intrinsic::x86_sse2_pmins_w: 11218 case Intrinsic::x86_sse41_pminsd: 11219 case Intrinsic::x86_avx2_pmins_b: 11220 case Intrinsic::x86_avx2_pmins_w: 11221 case Intrinsic::x86_avx2_pmins_d: 11222 case Intrinsic::x86_avx512_pmins_d: 11223 case Intrinsic::x86_avx512_pmins_q: { 11224 unsigned Opcode; 11225 switch (IntNo) { 11226 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 11227 case Intrinsic::x86_sse2_pmaxu_b: 11228 case Intrinsic::x86_sse41_pmaxuw: 11229 case Intrinsic::x86_sse41_pmaxud: 11230 case Intrinsic::x86_avx2_pmaxu_b: 11231 case Intrinsic::x86_avx2_pmaxu_w: 11232 case Intrinsic::x86_avx2_pmaxu_d: 11233 case Intrinsic::x86_avx512_pmaxu_d: 11234 case Intrinsic::x86_avx512_pmaxu_q: 11235 Opcode = X86ISD::UMAX; 11236 break; 11237 case Intrinsic::x86_sse2_pminu_b: 11238 case Intrinsic::x86_sse41_pminuw: 11239 case Intrinsic::x86_sse41_pminud: 11240 case Intrinsic::x86_avx2_pminu_b: 11241 case Intrinsic::x86_avx2_pminu_w: 11242 case Intrinsic::x86_avx2_pminu_d: 11243 case Intrinsic::x86_avx512_pminu_d: 11244 case Intrinsic::x86_avx512_pminu_q: 11245 Opcode = X86ISD::UMIN; 11246 break; 11247 case Intrinsic::x86_sse41_pmaxsb: 11248 case Intrinsic::x86_sse2_pmaxs_w: 11249 case Intrinsic::x86_sse41_pmaxsd: 11250 case Intrinsic::x86_avx2_pmaxs_b: 11251 case Intrinsic::x86_avx2_pmaxs_w: 11252 case Intrinsic::x86_avx2_pmaxs_d: 11253 case Intrinsic::x86_avx512_pmaxs_d: 11254 case Intrinsic::x86_avx512_pmaxs_q: 11255 Opcode = X86ISD::SMAX; 11256 break; 11257 case Intrinsic::x86_sse41_pminsb: 11258 case Intrinsic::x86_sse2_pmins_w: 11259 case Intrinsic::x86_sse41_pminsd: 11260 case Intrinsic::x86_avx2_pmins_b: 11261 case Intrinsic::x86_avx2_pmins_w: 11262 case Intrinsic::x86_avx2_pmins_d: 11263 case Intrinsic::x86_avx512_pmins_d: 11264 case Intrinsic::x86_avx512_pmins_q: 11265 Opcode = X86ISD::SMIN; 11266 break; 11267 } 11268 return DAG.getNode(Opcode, dl, Op.getValueType(), 11269 Op.getOperand(1), Op.getOperand(2)); 11270 } 11271 11272 // SSE/SSE2/AVX floating point max/min intrinsics. 
11273 case Intrinsic::x86_sse_max_ps: 11274 case Intrinsic::x86_sse2_max_pd: 11275 case Intrinsic::x86_avx_max_ps_256: 11276 case Intrinsic::x86_avx_max_pd_256: 11277 case Intrinsic::x86_avx512_max_ps_512: 11278 case Intrinsic::x86_avx512_max_pd_512: 11279 case Intrinsic::x86_sse_min_ps: 11280 case Intrinsic::x86_sse2_min_pd: 11281 case Intrinsic::x86_avx_min_ps_256: 11282 case Intrinsic::x86_avx_min_pd_256: 11283 case Intrinsic::x86_avx512_min_ps_512: 11284 case Intrinsic::x86_avx512_min_pd_512: { 11285 unsigned Opcode; 11286 switch (IntNo) { 11287 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 11288 case Intrinsic::x86_sse_max_ps: 11289 case Intrinsic::x86_sse2_max_pd: 11290 case Intrinsic::x86_avx_max_ps_256: 11291 case Intrinsic::x86_avx_max_pd_256: 11292 case Intrinsic::x86_avx512_max_ps_512: 11293 case Intrinsic::x86_avx512_max_pd_512: 11294 Opcode = X86ISD::FMAX; 11295 break; 11296 case Intrinsic::x86_sse_min_ps: 11297 case Intrinsic::x86_sse2_min_pd: 11298 case Intrinsic::x86_avx_min_ps_256: 11299 case Intrinsic::x86_avx_min_pd_256: 11300 case Intrinsic::x86_avx512_min_ps_512: 11301 case Intrinsic::x86_avx512_min_pd_512: 11302 Opcode = X86ISD::FMIN; 11303 break; 11304 } 11305 return DAG.getNode(Opcode, dl, Op.getValueType(), 11306 Op.getOperand(1), Op.getOperand(2)); 11307 } 11308 11309 // AVX2 variable shift intrinsics 11310 case Intrinsic::x86_avx2_psllv_d: 11311 case Intrinsic::x86_avx2_psllv_q: 11312 case Intrinsic::x86_avx2_psllv_d_256: 11313 case Intrinsic::x86_avx2_psllv_q_256: 11314 case Intrinsic::x86_avx2_psrlv_d: 11315 case Intrinsic::x86_avx2_psrlv_q: 11316 case Intrinsic::x86_avx2_psrlv_d_256: 11317 case Intrinsic::x86_avx2_psrlv_q_256: 11318 case Intrinsic::x86_avx2_psrav_d: 11319 case Intrinsic::x86_avx2_psrav_d_256: { 11320 unsigned Opcode; 11321 switch (IntNo) { 11322 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
11323     case Intrinsic::x86_avx2_psllv_d:
11324     case Intrinsic::x86_avx2_psllv_q:
11325     case Intrinsic::x86_avx2_psllv_d_256:
11326     case Intrinsic::x86_avx2_psllv_q_256:
11327       Opcode = ISD::SHL;
11328       break;
11329     case Intrinsic::x86_avx2_psrlv_d:
11330     case Intrinsic::x86_avx2_psrlv_q:
11331     case Intrinsic::x86_avx2_psrlv_d_256:
11332     case Intrinsic::x86_avx2_psrlv_q_256:
11333       Opcode = ISD::SRL;
11334       break;
11335     case Intrinsic::x86_avx2_psrav_d:
11336     case Intrinsic::x86_avx2_psrav_d_256:
11337       Opcode = ISD::SRA;
11338       break;
11339     }
11340     return DAG.getNode(Opcode, dl, Op.getValueType(),
11341                        Op.getOperand(1), Op.getOperand(2));
11342   }
11343
11344   case Intrinsic::x86_ssse3_pshuf_b_128:
11345   case Intrinsic::x86_avx2_pshuf_b:
11346     return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
11347                        Op.getOperand(1), Op.getOperand(2));
11348
11349   case Intrinsic::x86_ssse3_psign_b_128:
11350   case Intrinsic::x86_ssse3_psign_w_128:
11351   case Intrinsic::x86_ssse3_psign_d_128:
11352   case Intrinsic::x86_avx2_psign_b:
11353   case Intrinsic::x86_avx2_psign_w:
11354   case Intrinsic::x86_avx2_psign_d:
11355     return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
11356                        Op.getOperand(1), Op.getOperand(2));
11357
11358   case Intrinsic::x86_sse41_insertps:
11359     return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
11360                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
11361
11362   case Intrinsic::x86_avx_vperm2f128_ps_256:
11363   case Intrinsic::x86_avx_vperm2f128_pd_256:
11364   case Intrinsic::x86_avx_vperm2f128_si_256:
11365   case Intrinsic::x86_avx2_vperm2i128:
11366     return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
11367                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
11368
11369   case Intrinsic::x86_avx2_permd:
11370   case Intrinsic::x86_avx2_permps:
11371     // Operands intentionally swapped. Mask is last operand to intrinsic,
11372     // but second operand for node/instruction.
11373     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
11374                        Op.getOperand(2), Op.getOperand(1));
11375
11376   case Intrinsic::x86_sse_sqrt_ps:
11377   case Intrinsic::x86_sse2_sqrt_pd:
11378   case Intrinsic::x86_avx_sqrt_ps_256:
11379   case Intrinsic::x86_avx_sqrt_pd_256:
11380     return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1));
11381
11382   // ptest and testp intrinsics. The intrinsics these come from are designed
11383   // to return an integer value, not just set flags, so lower them to the
11384   // ptest or testp pattern and a setcc for the result.
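  // E.g. (illustrative): int r = _mm_testz_si128(a, b), i.e.
  // x86_sse41_ptestz, becomes
  //   (zext i32 (X86ISD::SETCC COND_E, (X86ISD::PTEST a, b)))
  // using COND_E because ptestz asks whether ZF was set.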
11385 case Intrinsic::x86_sse41_ptestz: 11386 case Intrinsic::x86_sse41_ptestc: 11387 case Intrinsic::x86_sse41_ptestnzc: 11388 case Intrinsic::x86_avx_ptestz_256: 11389 case Intrinsic::x86_avx_ptestc_256: 11390 case Intrinsic::x86_avx_ptestnzc_256: 11391 case Intrinsic::x86_avx_vtestz_ps: 11392 case Intrinsic::x86_avx_vtestc_ps: 11393 case Intrinsic::x86_avx_vtestnzc_ps: 11394 case Intrinsic::x86_avx_vtestz_pd: 11395 case Intrinsic::x86_avx_vtestc_pd: 11396 case Intrinsic::x86_avx_vtestnzc_pd: 11397 case Intrinsic::x86_avx_vtestz_ps_256: 11398 case Intrinsic::x86_avx_vtestc_ps_256: 11399 case Intrinsic::x86_avx_vtestnzc_ps_256: 11400 case Intrinsic::x86_avx_vtestz_pd_256: 11401 case Intrinsic::x86_avx_vtestc_pd_256: 11402 case Intrinsic::x86_avx_vtestnzc_pd_256: { 11403 bool IsTestPacked = false; 11404 unsigned X86CC; 11405 switch (IntNo) { 11406 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 11407 case Intrinsic::x86_avx_vtestz_ps: 11408 case Intrinsic::x86_avx_vtestz_pd: 11409 case Intrinsic::x86_avx_vtestz_ps_256: 11410 case Intrinsic::x86_avx_vtestz_pd_256: 11411 IsTestPacked = true; // Fallthrough 11412 case Intrinsic::x86_sse41_ptestz: 11413 case Intrinsic::x86_avx_ptestz_256: 11414 // ZF = 1 11415 X86CC = X86::COND_E; 11416 break; 11417 case Intrinsic::x86_avx_vtestc_ps: 11418 case Intrinsic::x86_avx_vtestc_pd: 11419 case Intrinsic::x86_avx_vtestc_ps_256: 11420 case Intrinsic::x86_avx_vtestc_pd_256: 11421 IsTestPacked = true; // Fallthrough 11422 case Intrinsic::x86_sse41_ptestc: 11423 case Intrinsic::x86_avx_ptestc_256: 11424 // CF = 1 11425 X86CC = X86::COND_B; 11426 break; 11427 case Intrinsic::x86_avx_vtestnzc_ps: 11428 case Intrinsic::x86_avx_vtestnzc_pd: 11429 case Intrinsic::x86_avx_vtestnzc_ps_256: 11430 case Intrinsic::x86_avx_vtestnzc_pd_256: 11431 IsTestPacked = true; // Fallthrough 11432 case Intrinsic::x86_sse41_ptestnzc: 11433 case Intrinsic::x86_avx_ptestnzc_256: 11434 // ZF and CF = 0 11435 X86CC = X86::COND_A; 11436 break; 11437 } 11438 11439 SDValue LHS = Op.getOperand(1); 11440 SDValue RHS = Op.getOperand(2); 11441 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 11442 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 11443 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 11444 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 11445 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 11446 } 11447 case Intrinsic::x86_avx512_kortestz: 11448 case Intrinsic::x86_avx512_kortestc: { 11449 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? 
X86::COND_E: X86::COND_B; 11450 SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1)); 11451 SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2)); 11452 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 11453 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); 11454 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 11455 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 11456 } 11457 11458 // SSE/AVX shift intrinsics 11459 case Intrinsic::x86_sse2_psll_w: 11460 case Intrinsic::x86_sse2_psll_d: 11461 case Intrinsic::x86_sse2_psll_q: 11462 case Intrinsic::x86_avx2_psll_w: 11463 case Intrinsic::x86_avx2_psll_d: 11464 case Intrinsic::x86_avx2_psll_q: 11465 case Intrinsic::x86_sse2_psrl_w: 11466 case Intrinsic::x86_sse2_psrl_d: 11467 case Intrinsic::x86_sse2_psrl_q: 11468 case Intrinsic::x86_avx2_psrl_w: 11469 case Intrinsic::x86_avx2_psrl_d: 11470 case Intrinsic::x86_avx2_psrl_q: 11471 case Intrinsic::x86_sse2_psra_w: 11472 case Intrinsic::x86_sse2_psra_d: 11473 case Intrinsic::x86_avx2_psra_w: 11474 case Intrinsic::x86_avx2_psra_d: { 11475 unsigned Opcode; 11476 switch (IntNo) { 11477 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 11478 case Intrinsic::x86_sse2_psll_w: 11479 case Intrinsic::x86_sse2_psll_d: 11480 case Intrinsic::x86_sse2_psll_q: 11481 case Intrinsic::x86_avx2_psll_w: 11482 case Intrinsic::x86_avx2_psll_d: 11483 case Intrinsic::x86_avx2_psll_q: 11484 Opcode = X86ISD::VSHL; 11485 break; 11486 case Intrinsic::x86_sse2_psrl_w: 11487 case Intrinsic::x86_sse2_psrl_d: 11488 case Intrinsic::x86_sse2_psrl_q: 11489 case Intrinsic::x86_avx2_psrl_w: 11490 case Intrinsic::x86_avx2_psrl_d: 11491 case Intrinsic::x86_avx2_psrl_q: 11492 Opcode = X86ISD::VSRL; 11493 break; 11494 case Intrinsic::x86_sse2_psra_w: 11495 case Intrinsic::x86_sse2_psra_d: 11496 case Intrinsic::x86_avx2_psra_w: 11497 case Intrinsic::x86_avx2_psra_d: 11498 Opcode = X86ISD::VSRA; 11499 break; 11500 } 11501 return DAG.getNode(Opcode, dl, Op.getValueType(), 11502 Op.getOperand(1), Op.getOperand(2)); 11503 } 11504 11505 // SSE/AVX immediate shift intrinsics 11506 case Intrinsic::x86_sse2_pslli_w: 11507 case Intrinsic::x86_sse2_pslli_d: 11508 case Intrinsic::x86_sse2_pslli_q: 11509 case Intrinsic::x86_avx2_pslli_w: 11510 case Intrinsic::x86_avx2_pslli_d: 11511 case Intrinsic::x86_avx2_pslli_q: 11512 case Intrinsic::x86_sse2_psrli_w: 11513 case Intrinsic::x86_sse2_psrli_d: 11514 case Intrinsic::x86_sse2_psrli_q: 11515 case Intrinsic::x86_avx2_psrli_w: 11516 case Intrinsic::x86_avx2_psrli_d: 11517 case Intrinsic::x86_avx2_psrli_q: 11518 case Intrinsic::x86_sse2_psrai_w: 11519 case Intrinsic::x86_sse2_psrai_d: 11520 case Intrinsic::x86_avx2_psrai_w: 11521 case Intrinsic::x86_avx2_psrai_d: { 11522 unsigned Opcode; 11523 switch (IntNo) { 11524 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
11525 case Intrinsic::x86_sse2_pslli_w: 11526 case Intrinsic::x86_sse2_pslli_d: 11527 case Intrinsic::x86_sse2_pslli_q: 11528 case Intrinsic::x86_avx2_pslli_w: 11529 case Intrinsic::x86_avx2_pslli_d: 11530 case Intrinsic::x86_avx2_pslli_q: 11531 Opcode = X86ISD::VSHLI; 11532 break; 11533 case Intrinsic::x86_sse2_psrli_w: 11534 case Intrinsic::x86_sse2_psrli_d: 11535 case Intrinsic::x86_sse2_psrli_q: 11536 case Intrinsic::x86_avx2_psrli_w: 11537 case Intrinsic::x86_avx2_psrli_d: 11538 case Intrinsic::x86_avx2_psrli_q: 11539 Opcode = X86ISD::VSRLI; 11540 break; 11541 case Intrinsic::x86_sse2_psrai_w: 11542 case Intrinsic::x86_sse2_psrai_d: 11543 case Intrinsic::x86_avx2_psrai_w: 11544 case Intrinsic::x86_avx2_psrai_d: 11545 Opcode = X86ISD::VSRAI; 11546 break; 11547 } 11548 return getTargetVShiftNode(Opcode, dl, Op.getValueType(), 11549 Op.getOperand(1), Op.getOperand(2), DAG); 11550 } 11551 11552 case Intrinsic::x86_sse42_pcmpistria128: 11553 case Intrinsic::x86_sse42_pcmpestria128: 11554 case Intrinsic::x86_sse42_pcmpistric128: 11555 case Intrinsic::x86_sse42_pcmpestric128: 11556 case Intrinsic::x86_sse42_pcmpistrio128: 11557 case Intrinsic::x86_sse42_pcmpestrio128: 11558 case Intrinsic::x86_sse42_pcmpistris128: 11559 case Intrinsic::x86_sse42_pcmpestris128: 11560 case Intrinsic::x86_sse42_pcmpistriz128: 11561 case Intrinsic::x86_sse42_pcmpestriz128: { 11562 unsigned Opcode; 11563 unsigned X86CC; 11564 switch (IntNo) { 11565 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 11566 case Intrinsic::x86_sse42_pcmpistria128: 11567 Opcode = X86ISD::PCMPISTRI; 11568 X86CC = X86::COND_A; 11569 break; 11570 case Intrinsic::x86_sse42_pcmpestria128: 11571 Opcode = X86ISD::PCMPESTRI; 11572 X86CC = X86::COND_A; 11573 break; 11574 case Intrinsic::x86_sse42_pcmpistric128: 11575 Opcode = X86ISD::PCMPISTRI; 11576 X86CC = X86::COND_B; 11577 break; 11578 case Intrinsic::x86_sse42_pcmpestric128: 11579 Opcode = X86ISD::PCMPESTRI; 11580 X86CC = X86::COND_B; 11581 break; 11582 case Intrinsic::x86_sse42_pcmpistrio128: 11583 Opcode = X86ISD::PCMPISTRI; 11584 X86CC = X86::COND_O; 11585 break; 11586 case Intrinsic::x86_sse42_pcmpestrio128: 11587 Opcode = X86ISD::PCMPESTRI; 11588 X86CC = X86::COND_O; 11589 break; 11590 case Intrinsic::x86_sse42_pcmpistris128: 11591 Opcode = X86ISD::PCMPISTRI; 11592 X86CC = X86::COND_S; 11593 break; 11594 case Intrinsic::x86_sse42_pcmpestris128: 11595 Opcode = X86ISD::PCMPESTRI; 11596 X86CC = X86::COND_S; 11597 break; 11598 case Intrinsic::x86_sse42_pcmpistriz128: 11599 Opcode = X86ISD::PCMPISTRI; 11600 X86CC = X86::COND_E; 11601 break; 11602 case Intrinsic::x86_sse42_pcmpestriz128: 11603 Opcode = X86ISD::PCMPESTRI; 11604 X86CC = X86::COND_E; 11605 break; 11606 } 11607 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 11608 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 11609 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); 11610 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 11611 DAG.getConstant(X86CC, MVT::i8), 11612 SDValue(PCMP.getNode(), 1)); 11613 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 11614 } 11615 11616 case Intrinsic::x86_sse42_pcmpistri128: 11617 case Intrinsic::x86_sse42_pcmpestri128: { 11618 unsigned Opcode; 11619 if (IntNo == Intrinsic::x86_sse42_pcmpistri128) 11620 Opcode = X86ISD::PCMPISTRI; 11621 else 11622 Opcode = X86ISD::PCMPESTRI; 11623 11624 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 11625 SDVTList VTs = DAG.getVTList(Op.getValueType(), 
MVT::i32); 11626 return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); 11627 } 11628 case Intrinsic::x86_fma_vfmadd_ps: 11629 case Intrinsic::x86_fma_vfmadd_pd: 11630 case Intrinsic::x86_fma_vfmsub_ps: 11631 case Intrinsic::x86_fma_vfmsub_pd: 11632 case Intrinsic::x86_fma_vfnmadd_ps: 11633 case Intrinsic::x86_fma_vfnmadd_pd: 11634 case Intrinsic::x86_fma_vfnmsub_ps: 11635 case Intrinsic::x86_fma_vfnmsub_pd: 11636 case Intrinsic::x86_fma_vfmaddsub_ps: 11637 case Intrinsic::x86_fma_vfmaddsub_pd: 11638 case Intrinsic::x86_fma_vfmsubadd_ps: 11639 case Intrinsic::x86_fma_vfmsubadd_pd: 11640 case Intrinsic::x86_fma_vfmadd_ps_256: 11641 case Intrinsic::x86_fma_vfmadd_pd_256: 11642 case Intrinsic::x86_fma_vfmsub_ps_256: 11643 case Intrinsic::x86_fma_vfmsub_pd_256: 11644 case Intrinsic::x86_fma_vfnmadd_ps_256: 11645 case Intrinsic::x86_fma_vfnmadd_pd_256: 11646 case Intrinsic::x86_fma_vfnmsub_ps_256: 11647 case Intrinsic::x86_fma_vfnmsub_pd_256: 11648 case Intrinsic::x86_fma_vfmaddsub_ps_256: 11649 case Intrinsic::x86_fma_vfmaddsub_pd_256: 11650 case Intrinsic::x86_fma_vfmsubadd_ps_256: 11651 case Intrinsic::x86_fma_vfmsubadd_pd_256: { 11652 unsigned Opc; 11653 switch (IntNo) { 11654 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 11655 case Intrinsic::x86_fma_vfmadd_ps: 11656 case Intrinsic::x86_fma_vfmadd_pd: 11657 case Intrinsic::x86_fma_vfmadd_ps_256: 11658 case Intrinsic::x86_fma_vfmadd_pd_256: 11659 Opc = X86ISD::FMADD; 11660 break; 11661 case Intrinsic::x86_fma_vfmsub_ps: 11662 case Intrinsic::x86_fma_vfmsub_pd: 11663 case Intrinsic::x86_fma_vfmsub_ps_256: 11664 case Intrinsic::x86_fma_vfmsub_pd_256: 11665 Opc = X86ISD::FMSUB; 11666 break; 11667 case Intrinsic::x86_fma_vfnmadd_ps: 11668 case Intrinsic::x86_fma_vfnmadd_pd: 11669 case Intrinsic::x86_fma_vfnmadd_ps_256: 11670 case Intrinsic::x86_fma_vfnmadd_pd_256: 11671 Opc = X86ISD::FNMADD; 11672 break; 11673 case Intrinsic::x86_fma_vfnmsub_ps: 11674 case Intrinsic::x86_fma_vfnmsub_pd: 11675 case Intrinsic::x86_fma_vfnmsub_ps_256: 11676 case Intrinsic::x86_fma_vfnmsub_pd_256: 11677 Opc = X86ISD::FNMSUB; 11678 break; 11679 case Intrinsic::x86_fma_vfmaddsub_ps: 11680 case Intrinsic::x86_fma_vfmaddsub_pd: 11681 case Intrinsic::x86_fma_vfmaddsub_ps_256: 11682 case Intrinsic::x86_fma_vfmaddsub_pd_256: 11683 Opc = X86ISD::FMADDSUB; 11684 break; 11685 case Intrinsic::x86_fma_vfmsubadd_ps: 11686 case Intrinsic::x86_fma_vfmsubadd_pd: 11687 case Intrinsic::x86_fma_vfmsubadd_ps_256: 11688 case Intrinsic::x86_fma_vfmsubadd_pd_256: 11689 Opc = X86ISD::FMSUBADD; 11690 break; 11691 } 11692 11693 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), 11694 Op.getOperand(2), Op.getOperand(3)); 11695 } 11696 } 11697} 11698 11699static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11700 SDValue Base, SDValue Index, 11701 SDValue ScaleOp, SDValue Chain, 11702 const X86Subtarget * Subtarget) { 11703 SDLoc dl(Op); 11704 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11705 assert(C && "Invalid scale type"); 11706 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11707 SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 11708 EVT MaskVT = MVT::getVectorVT(MVT::i1, 11709 Index.getValueType().getVectorNumElements()); 11710 SDValue MaskInReg = DAG.getConstant(~0, MaskVT); 11711 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); 11712 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11713 SDValue Segment = DAG.getRegister(0, MVT::i32); 11714 
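  // The machine-level gather takes a full x86 memory reference, which is
  // always the five-operand tuple { Base, Scale, Index, Disp, Segment };
  // here only the register pieces vary, the displacement is zero and no
  // segment override is used.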
SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 11715 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 11716 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; 11717 return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); 11718} 11719 11720static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11721 SDValue Src, SDValue Mask, SDValue Base, 11722 SDValue Index, SDValue ScaleOp, SDValue Chain, 11723 const X86Subtarget * Subtarget) { 11724 SDLoc dl(Op); 11725 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11726 assert(C && "Invalid scale type"); 11727 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11728 EVT MaskVT = MVT::getVectorVT(MVT::i1, 11729 Index.getValueType().getVectorNumElements()); 11730 SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 11731 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); 11732 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11733 SDValue Segment = DAG.getRegister(0, MVT::i32); 11734 if (Src.getOpcode() == ISD::UNDEF) 11735 Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 11736 SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 11737 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 11738 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; 11739 return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); 11740} 11741 11742static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11743 SDValue Src, SDValue Base, SDValue Index, 11744 SDValue ScaleOp, SDValue Chain) { 11745 SDLoc dl(Op); 11746 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11747 assert(C && "Invalid scale type"); 11748 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11749 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11750 SDValue Segment = DAG.getRegister(0, MVT::i32); 11751 EVT MaskVT = MVT::getVectorVT(MVT::i1, 11752 Index.getValueType().getVectorNumElements()); 11753 SDValue MaskInReg = DAG.getConstant(~0, MaskVT); 11754 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); 11755 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; 11756 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 11757 return SDValue(Res, 1); 11758} 11759 11760static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 11761 SDValue Src, SDValue Mask, SDValue Base, 11762 SDValue Index, SDValue ScaleOp, SDValue Chain) { 11763 SDLoc dl(Op); 11764 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 11765 assert(C && "Invalid scale type"); 11766 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 11767 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 11768 SDValue Segment = DAG.getRegister(0, MVT::i32); 11769 EVT MaskVT = MVT::getVectorVT(MVT::i1, 11770 Index.getValueType().getVectorNumElements()); 11771 SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 11772 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); 11773 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; 11774 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 11775 return SDValue(Res, 1); 11776} 11777 11778static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, 11779 SelectionDAG &DAG) { 11780 SDLoc dl(Op); 11781 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 11782 switch (IntNo) { 11783 default: return SDValue(); // Don't custom lower most intrinsics. 11784 11785 // RDRAND/RDSEED intrinsics. 
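  // Both instructions report success in CF: CF = 1 means the destination
  // received a valid random value, CF = 0 means it was cleared. A typical
  // consumer loop (illustrative) is
  //   do { ok = _rdrand32_step(&r); } while (!ok);
  // which is why the lowering below selects on X86::COND_B.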
11786 case Intrinsic::x86_rdrand_16: 11787 case Intrinsic::x86_rdrand_32: 11788 case Intrinsic::x86_rdrand_64: 11789 case Intrinsic::x86_rdseed_16: 11790 case Intrinsic::x86_rdseed_32: 11791 case Intrinsic::x86_rdseed_64: { 11792 unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 || 11793 IntNo == Intrinsic::x86_rdseed_32 || 11794 IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED : 11795 X86ISD::RDRAND; 11796 // Emit the node with the right value type. 11797 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); 11798 SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0)); 11799 11800 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. 11801 // Otherwise return the value from Rand, which is always 0, casted to i32. 11802 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), 11803 DAG.getConstant(1, Op->getValueType(1)), 11804 DAG.getConstant(X86::COND_B, MVT::i32), 11805 SDValue(Result.getNode(), 1) }; 11806 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, 11807 DAG.getVTList(Op->getValueType(1), MVT::Glue), 11808 Ops, array_lengthof(Ops)); 11809 11810 // Return { result, isValid, chain }. 11811 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, 11812 SDValue(Result.getNode(), 2)); 11813 } 11814 //int_gather(index, base, scale); 11815 case Intrinsic::x86_avx512_gather_qpd_512: 11816 case Intrinsic::x86_avx512_gather_qps_512: 11817 case Intrinsic::x86_avx512_gather_dpd_512: 11818 case Intrinsic::x86_avx512_gather_qpi_512: 11819 case Intrinsic::x86_avx512_gather_qpq_512: 11820 case Intrinsic::x86_avx512_gather_dpq_512: 11821 case Intrinsic::x86_avx512_gather_dps_512: 11822 case Intrinsic::x86_avx512_gather_dpi_512: { 11823 unsigned Opc; 11824 switch (IntNo) { 11825 default: llvm_unreachable("Unexpected intrinsic!"); 11826 case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break; 11827 case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break; 11828 case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break; 11829 case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break; 11830 case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break; 11831 case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break; 11832 case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break; 11833 case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break; 11834 } 11835 SDValue Chain = Op.getOperand(0); 11836 SDValue Index = Op.getOperand(2); 11837 SDValue Base = Op.getOperand(3); 11838 SDValue Scale = Op.getOperand(4); 11839 return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget); 11840 } 11841 //int_gather_mask(v1, mask, index, base, scale); 11842 case Intrinsic::x86_avx512_gather_qps_mask_512: 11843 case Intrinsic::x86_avx512_gather_qpd_mask_512: 11844 case Intrinsic::x86_avx512_gather_dpd_mask_512: 11845 case Intrinsic::x86_avx512_gather_dps_mask_512: 11846 case Intrinsic::x86_avx512_gather_qpi_mask_512: 11847 case Intrinsic::x86_avx512_gather_qpq_mask_512: 11848 case Intrinsic::x86_avx512_gather_dpi_mask_512: 11849 case Intrinsic::x86_avx512_gather_dpq_mask_512: { 11850 unsigned Opc; 11851 switch (IntNo) { 11852 default: llvm_unreachable("Unexpected intrinsic!"); 11853 case Intrinsic::x86_avx512_gather_qps_mask_512: 11854 Opc = X86::VGATHERQPSZrm; break; 11855 case Intrinsic::x86_avx512_gather_qpd_mask_512: 11856 Opc = X86::VGATHERQPDZrm; break; 11857 case 
Intrinsic::x86_avx512_gather_dpd_mask_512: 11858 Opc = X86::VGATHERDPDZrm; break; 11859 case Intrinsic::x86_avx512_gather_dps_mask_512: 11860 Opc = X86::VGATHERDPSZrm; break; 11861 case Intrinsic::x86_avx512_gather_qpi_mask_512: 11862 Opc = X86::VPGATHERQDZrm; break; 11863 case Intrinsic::x86_avx512_gather_qpq_mask_512: 11864 Opc = X86::VPGATHERQQZrm; break; 11865 case Intrinsic::x86_avx512_gather_dpi_mask_512: 11866 Opc = X86::VPGATHERDDZrm; break; 11867 case Intrinsic::x86_avx512_gather_dpq_mask_512: 11868 Opc = X86::VPGATHERDQZrm; break; 11869 } 11870 SDValue Chain = Op.getOperand(0); 11871 SDValue Src = Op.getOperand(2); 11872 SDValue Mask = Op.getOperand(3); 11873 SDValue Index = Op.getOperand(4); 11874 SDValue Base = Op.getOperand(5); 11875 SDValue Scale = Op.getOperand(6); 11876 return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain, 11877 Subtarget); 11878 } 11879 //int_scatter(base, index, v1, scale); 11880 case Intrinsic::x86_avx512_scatter_qpd_512: 11881 case Intrinsic::x86_avx512_scatter_qps_512: 11882 case Intrinsic::x86_avx512_scatter_dpd_512: 11883 case Intrinsic::x86_avx512_scatter_qpi_512: 11884 case Intrinsic::x86_avx512_scatter_qpq_512: 11885 case Intrinsic::x86_avx512_scatter_dpq_512: 11886 case Intrinsic::x86_avx512_scatter_dps_512: 11887 case Intrinsic::x86_avx512_scatter_dpi_512: { 11888 unsigned Opc; 11889 switch (IntNo) { 11890 default: llvm_unreachable("Unexpected intrinsic!"); 11891 case Intrinsic::x86_avx512_scatter_qpd_512: 11892 Opc = X86::VSCATTERQPDZmr; break; 11893 case Intrinsic::x86_avx512_scatter_qps_512: 11894 Opc = X86::VSCATTERQPSZmr; break; 11895 case Intrinsic::x86_avx512_scatter_dpd_512: 11896 Opc = X86::VSCATTERDPDZmr; break; 11897 case Intrinsic::x86_avx512_scatter_dps_512: 11898 Opc = X86::VSCATTERDPSZmr; break; 11899 case Intrinsic::x86_avx512_scatter_qpi_512: 11900 Opc = X86::VPSCATTERQDZmr; break; 11901 case Intrinsic::x86_avx512_scatter_qpq_512: 11902 Opc = X86::VPSCATTERQQZmr; break; 11903 case Intrinsic::x86_avx512_scatter_dpq_512: 11904 Opc = X86::VPSCATTERDQZmr; break; 11905 case Intrinsic::x86_avx512_scatter_dpi_512: 11906 Opc = X86::VPSCATTERDDZmr; break; 11907 } 11908 SDValue Chain = Op.getOperand(0); 11909 SDValue Base = Op.getOperand(2); 11910 SDValue Index = Op.getOperand(3); 11911 SDValue Src = Op.getOperand(4); 11912 SDValue Scale = Op.getOperand(5); 11913 return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain); 11914 } 11915 //int_scatter_mask(base, mask, index, v1, scale); 11916 case Intrinsic::x86_avx512_scatter_qps_mask_512: 11917 case Intrinsic::x86_avx512_scatter_qpd_mask_512: 11918 case Intrinsic::x86_avx512_scatter_dpd_mask_512: 11919 case Intrinsic::x86_avx512_scatter_dps_mask_512: 11920 case Intrinsic::x86_avx512_scatter_qpi_mask_512: 11921 case Intrinsic::x86_avx512_scatter_qpq_mask_512: 11922 case Intrinsic::x86_avx512_scatter_dpi_mask_512: 11923 case Intrinsic::x86_avx512_scatter_dpq_mask_512: { 11924 unsigned Opc; 11925 switch (IntNo) { 11926 default: llvm_unreachable("Unexpected intrinsic!"); 11927 case Intrinsic::x86_avx512_scatter_qpd_mask_512: 11928 Opc = X86::VSCATTERQPDZmr; break; 11929 case Intrinsic::x86_avx512_scatter_qps_mask_512: 11930 Opc = X86::VSCATTERQPSZmr; break; 11931 case Intrinsic::x86_avx512_scatter_dpd_mask_512: 11932 Opc = X86::VSCATTERDPDZmr; break; 11933 case Intrinsic::x86_avx512_scatter_dps_mask_512: 11934 Opc = X86::VSCATTERDPSZmr; break; 11935 case Intrinsic::x86_avx512_scatter_qpi_mask_512: 11936 Opc = X86::VPSCATTERQDZmr; break; 11937 case 
Intrinsic::x86_avx512_scatter_qpq_mask_512: 11938 Opc = X86::VPSCATTERQQZmr; break; 11939 case Intrinsic::x86_avx512_scatter_dpq_mask_512: 11940 Opc = X86::VPSCATTERDQZmr; break; 11941 case Intrinsic::x86_avx512_scatter_dpi_mask_512: 11942 Opc = X86::VPSCATTERDDZmr; break; 11943 } 11944 SDValue Chain = Op.getOperand(0); 11945 SDValue Base = Op.getOperand(2); 11946 SDValue Mask = Op.getOperand(3); 11947 SDValue Index = Op.getOperand(4); 11948 SDValue Src = Op.getOperand(5); 11949 SDValue Scale = Op.getOperand(6); 11950 return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain); 11951 } 11952 // XTEST intrinsics. 11953 case Intrinsic::x86_xtest: { 11954 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); 11955 SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0)); 11956 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 11957 DAG.getConstant(X86::COND_NE, MVT::i8), 11958 InTrans); 11959 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); 11960 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), 11961 Ret, SDValue(InTrans.getNode(), 1)); 11962 } 11963 } 11964} 11965 11966SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 11967 SelectionDAG &DAG) const { 11968 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 11969 MFI->setReturnAddressIsTaken(true); 11970 11971 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 11972 SDLoc dl(Op); 11973 EVT PtrVT = getPointerTy(); 11974 11975 if (Depth > 0) { 11976 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 11977 const X86RegisterInfo *RegInfo = 11978 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 11979 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); 11980 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 11981 DAG.getNode(ISD::ADD, dl, PtrVT, 11982 FrameAddr, Offset), 11983 MachinePointerInfo(), false, false, false, 0); 11984 } 11985 11986 // Just load the return address. 
11987 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 11988 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 11989 RetAddrFI, MachinePointerInfo(), false, false, false, 0); 11990} 11991 11992SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 11993 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 11994 MFI->setFrameAddressIsTaken(true); 11995 11996 EVT VT = Op.getValueType(); 11997 SDLoc dl(Op); // FIXME probably not meaningful 11998 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 11999 const X86RegisterInfo *RegInfo = 12000 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 12001 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); 12002 assert(((FrameReg == X86::RBP && VT == MVT::i64) || 12003 (FrameReg == X86::EBP && VT == MVT::i32)) && 12004 "Invalid Frame Register!"); 12005 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 12006 while (Depth--) 12007 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 12008 MachinePointerInfo(), 12009 false, false, false, 0); 12010 return FrameAddr; 12011} 12012 12013SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 12014 SelectionDAG &DAG) const { 12015 const X86RegisterInfo *RegInfo = 12016 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 12017 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); 12018} 12019 12020SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 12021 SDValue Chain = Op.getOperand(0); 12022 SDValue Offset = Op.getOperand(1); 12023 SDValue Handler = Op.getOperand(2); 12024 SDLoc dl (Op); 12025 12026 EVT PtrVT = getPointerTy(); 12027 const X86RegisterInfo *RegInfo = 12028 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 12029 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); 12030 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || 12031 (FrameReg == X86::EBP && PtrVT == MVT::i32)) && 12032 "Invalid Frame Register!"); 12033 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); 12034 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? 
X86::RCX : X86::ECX; 12035 12036 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, 12037 DAG.getIntPtrConstant(RegInfo->getSlotSize())); 12038 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); 12039 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 12040 false, false, 0); 12041 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 12042 12043 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, 12044 DAG.getRegister(StoreAddrReg, PtrVT)); 12045} 12046 12047SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, 12048 SelectionDAG &DAG) const { 12049 SDLoc DL(Op); 12050 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, 12051 DAG.getVTList(MVT::i32, MVT::Other), 12052 Op.getOperand(0), Op.getOperand(1)); 12053} 12054 12055SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, 12056 SelectionDAG &DAG) const { 12057 SDLoc DL(Op); 12058 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, 12059 Op.getOperand(0), Op.getOperand(1)); 12060} 12061 12062static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { 12063 return Op.getOperand(0); 12064} 12065 12066SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 12067 SelectionDAG &DAG) const { 12068 SDValue Root = Op.getOperand(0); 12069 SDValue Trmp = Op.getOperand(1); // trampoline 12070 SDValue FPtr = Op.getOperand(2); // nested function 12071 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 12072 SDLoc dl (Op); 12073 12074 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 12075 const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); 12076 12077 if (Subtarget->is64Bit()) { 12078 SDValue OutChains[6]; 12079 12080 // Large code-model. 12081 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 12082 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 12083 12084 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; 12085 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; 12086 12087 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 12088 12089 // Load the pointer to the nested function into R11. 12090 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 12091 SDValue Addr = Trmp; 12092 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 12093 Addr, MachinePointerInfo(TrmpAddr), 12094 false, false, 0); 12095 12096 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 12097 DAG.getConstant(2, MVT::i64)); 12098 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 12099 MachinePointerInfo(TrmpAddr, 2), 12100 false, false, 2); 12101 12102 // Load the 'nest' parameter value into R10. 12103 // R10 is specified in X86CallingConv.td 12104 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 12105 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 12106 DAG.getConstant(10, MVT::i64)); 12107 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 12108 Addr, MachinePointerInfo(TrmpAddr, 10), 12109 false, false, 0); 12110 12111 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 12112 DAG.getConstant(12, MVT::i64)); 12113 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 12114 MachinePointerInfo(TrmpAddr, 12), 12115 false, false, 2); 12116 12117 // Jump to the nested function. 12118 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
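// For reference, the trampoline written by these stores is the following
// byte sequence (a sketch derived from the constants above; the i16 opcode
// stores are little-endian, so the REX byte lands first):
//   offset  0: 49 BB <FPtr, 8 bytes>   movabsq $FPtr, %r11
//   offset 10: 49 BA <Nest, 8 bytes>   movabsq $Nest, %r10
//   offset 20: 49 FF E3                jmpq   *%r11
// The last two stores below emit that REX+opcode pair at offset 20 and the
// ModRM byte 0xE3 (mod=3, reg=4, rm=r11) at offset 22: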
12119 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 12120 DAG.getConstant(20, MVT::i64)); 12121 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 12122 Addr, MachinePointerInfo(TrmpAddr, 20), 12123 false, false, 0); 12124 12125 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 12126 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 12127 DAG.getConstant(22, MVT::i64)); 12128 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 12129 MachinePointerInfo(TrmpAddr, 22), 12130 false, false, 0); 12131 12132 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); 12133 } else { 12134 const Function *Func = 12135 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 12136 CallingConv::ID CC = Func->getCallingConv(); 12137 unsigned NestReg; 12138 12139 switch (CC) { 12140 default: 12141 llvm_unreachable("Unsupported calling convention"); 12142 case CallingConv::C: 12143 case CallingConv::X86_StdCall: { 12144 // Pass 'nest' parameter in ECX. 12145 // Must be kept in sync with X86CallingConv.td 12146 NestReg = X86::ECX; 12147 12148 // Check that ECX wasn't needed by an 'inreg' parameter. 12149 FunctionType *FTy = Func->getFunctionType(); 12150 const AttributeSet &Attrs = Func->getAttributes(); 12151 12152 if (!Attrs.isEmpty() && !Func->isVarArg()) { 12153 unsigned InRegCount = 0; 12154 unsigned Idx = 1; 12155 12156 for (FunctionType::param_iterator I = FTy->param_begin(), 12157 E = FTy->param_end(); I != E; ++I, ++Idx) 12158 if (Attrs.hasAttribute(Idx, Attribute::InReg)) 12159 // FIXME: should only count parameters that are lowered to integers. 12160 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 12161 12162 if (InRegCount > 2) { 12163 report_fatal_error("Nest register in use - reduce number of inreg" 12164 " parameters!"); 12165 } 12166 } 12167 break; 12168 } 12169 case CallingConv::X86_FastCall: 12170 case CallingConv::X86_ThisCall: 12171 case CallingConv::Fast: 12172 // Pass 'nest' parameter in EAX. 12173 // Must be kept in sync with X86CallingConv.td 12174 NestReg = X86::EAX; 12175 break; 12176 } 12177 12178 SDValue OutChains[4]; 12179 SDValue Addr, Disp; 12180 12181 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 12182 DAG.getConstant(10, MVT::i32)); 12183 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 12184 12185 // This is storing the opcode for MOV32ri. 12186 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 12187 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; 12188 OutChains[0] = DAG.getStore(Root, dl, 12189 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 12190 Trmp, MachinePointerInfo(TrmpAddr), 12191 false, false, 0); 12192 12193 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 12194 DAG.getConstant(1, MVT::i32)); 12195 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 12196 MachinePointerInfo(TrmpAddr, 1), 12197 false, false, 1); 12198 12199 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
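// The 32-bit trampoline being assembled is 10 bytes (illustrative layout;
// Disp was computed against Trmp+10 above because a rel32 jmp is relative
// to the end of the jmp instruction):
//   offset 0: B8+r <Nest, 4 bytes>   movl $Nest, %eax|%ecx
//   offset 5: E9 <Disp, 4 bytes>     jmp  FPtr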
12200 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
12201 DAG.getConstant(5, MVT::i32));
12202 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
12203 MachinePointerInfo(TrmpAddr, 5),
12204 false, false, 1);
12205
12206 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
12207 DAG.getConstant(6, MVT::i32));
12208 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
12209 MachinePointerInfo(TrmpAddr, 6),
12210 false, false, 1);
12211
12212 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
12213 }
12214}
12215
12216SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
12217 SelectionDAG &DAG) const {
12218 /*
12219 The rounding mode is in bits 11:10 of the FP control word (FPCW, the
12220 word saved by fnstcw below), and has the following settings:
12221 00 Round to nearest
12222 01 Round to -inf
12223 10 Round to +inf
12224 11 Round to 0
12225
12226 FLT_ROUNDS, on the other hand, expects the following:
12227 -1 Undefined
12228 0 Round to 0
12229 1 Round to nearest
12230 2 Round to +inf
12231 3 Round to -inf
12232
12233 To perform the conversion, we do:
12234 (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
12235 */
12236
12237 MachineFunction &MF = DAG.getMachineFunction();
12238 const TargetMachine &TM = MF.getTarget();
12239 const TargetFrameLowering &TFI = *TM.getFrameLowering();
12240 unsigned StackAlignment = TFI.getStackAlignment();
12241 EVT VT = Op.getValueType();
12242 SDLoc DL(Op);
12243
12244 // Save FP Control Word to stack slot
12245 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
12246 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
12247
12248 MachineMemOperand *MMO =
12249 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
12250 MachineMemOperand::MOStore, 2, 2);
12251
12252 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
12253 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
12254 DAG.getVTList(MVT::Other),
12255 Ops, array_lengthof(Ops), MVT::i16,
12256 MMO);
12257
12258 // Load FP Control Word from stack slot
12259 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
12260 MachinePointerInfo(), false, false, false, 0);
12261
12262 // Transform as necessary
12263 SDValue CWD1 =
12264 DAG.getNode(ISD::SRL, DL, MVT::i16,
12265 DAG.getNode(ISD::AND, DL, MVT::i16,
12266 CWD, DAG.getConstant(0x800, MVT::i16)),
12267 DAG.getConstant(11, MVT::i8));
12268 SDValue CWD2 =
12269 DAG.getNode(ISD::SRL, DL, MVT::i16,
12270 DAG.getNode(ISD::AND, DL, MVT::i16,
12271 CWD, DAG.getConstant(0x400, MVT::i16)),
12272 DAG.getConstant(9, MVT::i8));
12273
12274 SDValue RetVal =
12275 DAG.getNode(ISD::AND, DL, MVT::i16,
12276 DAG.getNode(ISD::ADD, DL, MVT::i16,
12277 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
12278 DAG.getConstant(1, MVT::i16)),
12279 DAG.getConstant(3, MVT::i16));
12280
12281 return DAG.getNode((VT.getSizeInBits() < 16 ?
12282 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
12283}
12284
12285static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
12286 EVT VT = Op.getValueType();
12287 EVT OpVT = VT;
12288 unsigned NumBits = VT.getSizeInBits();
12289 SDLoc dl(Op);
12290
12291 Op = Op.getOperand(0);
12292 if (VT == MVT::i8) {
12293 // Zero extend to i32 since there is not an i8 bsr.
12294 OpVT = MVT::i32;
12295 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
12296 }
12297
12298 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
12299 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
12300 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
12301
12302 // If src is zero (i.e.
bsr sets ZF), returns NumBits. 12303 SDValue Ops[] = { 12304 Op, 12305 DAG.getConstant(NumBits+NumBits-1, OpVT), 12306 DAG.getConstant(X86::COND_E, MVT::i8), 12307 Op.getValue(1) 12308 }; 12309 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 12310 12311 // Finally xor with NumBits-1. 12312 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 12313 12314 if (VT == MVT::i8) 12315 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 12316 return Op; 12317} 12318 12319static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { 12320 EVT VT = Op.getValueType(); 12321 EVT OpVT = VT; 12322 unsigned NumBits = VT.getSizeInBits(); 12323 SDLoc dl(Op); 12324 12325 Op = Op.getOperand(0); 12326 if (VT == MVT::i8) { 12327 // Zero extend to i32 since there is not an i8 bsr. 12328 OpVT = MVT::i32; 12329 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 12330 } 12331 12332 // Issue a bsr (scan bits in reverse). 12333 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 12334 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 12335 12336 // And xor with NumBits-1. 12337 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 12338 12339 if (VT == MVT::i8) 12340 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 12341 return Op; 12342} 12343 12344static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 12345 EVT VT = Op.getValueType(); 12346 unsigned NumBits = VT.getSizeInBits(); 12347 SDLoc dl(Op); 12348 Op = Op.getOperand(0); 12349 12350 // Issue a bsf (scan bits forward) which also sets EFLAGS. 12351 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 12352 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 12353 12354 // If src is zero (i.e. bsf sets ZF), returns NumBits. 12355 SDValue Ops[] = { 12356 Op, 12357 DAG.getConstant(NumBits, VT), 12358 DAG.getConstant(X86::COND_E, MVT::i8), 12359 Op.getValue(1) 12360 }; 12361 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); 12362} 12363 12364// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit 12365// ones, and then concatenate the result back. 
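// For example (illustrative), a v8i32 add on a subtarget without 256-bit
// integer ops becomes
//   (v8i32 concat_vectors (add (extract_subvector LHS, 0),
//                              (extract_subvector RHS, 0)),
//                         (add (extract_subvector LHS, 4),
//                              (extract_subvector RHS, 4)))
// so that each v4i32 half can select to a single 128-bit instruction.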
12366static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 12367 EVT VT = Op.getValueType(); 12368 12369 assert(VT.is256BitVector() && VT.isInteger() && 12370 "Unsupported value type for operation"); 12371 12372 unsigned NumElems = VT.getVectorNumElements(); 12373 SDLoc dl(Op); 12374 12375 // Extract the LHS vectors 12376 SDValue LHS = Op.getOperand(0); 12377 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 12378 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 12379 12380 // Extract the RHS vectors 12381 SDValue RHS = Op.getOperand(1); 12382 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 12383 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 12384 12385 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 12386 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 12387 12388 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 12389 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 12390 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 12391} 12392 12393static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { 12394 assert(Op.getValueType().is256BitVector() && 12395 Op.getValueType().isInteger() && 12396 "Only handle AVX 256-bit vector integer operation"); 12397 return Lower256IntArith(Op, DAG); 12398} 12399 12400static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { 12401 assert(Op.getValueType().is256BitVector() && 12402 Op.getValueType().isInteger() && 12403 "Only handle AVX 256-bit vector integer operation"); 12404 return Lower256IntArith(Op, DAG); 12405} 12406 12407static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, 12408 SelectionDAG &DAG) { 12409 SDLoc dl(Op); 12410 EVT VT = Op.getValueType(); 12411 12412 // Decompose 256-bit ops into smaller 128-bit ops. 12413 if (VT.is256BitVector() && !Subtarget->hasInt256()) 12414 return Lower256IntArith(Op, DAG); 12415 12416 SDValue A = Op.getOperand(0); 12417 SDValue B = Op.getOperand(1); 12418 12419 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. 12420 if (VT == MVT::v4i32) { 12421 assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && 12422 "Should not custom lower when pmuldq is available!"); 12423 12424 // Extract the odd parts. 12425 static const int UnpackMask[] = { 1, -1, 3, -1 }; 12426 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); 12427 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); 12428 12429 // Multiply the even parts. 12430 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); 12431 // Now multiply odd parts. 12432 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); 12433 12434 Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens); 12435 Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds); 12436 12437 // Merge the two vectors back together with a shuffle. This expands into 2 12438 // shuffles. 
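// Worked example of the whole sequence (a sketch): for A = <a0,a1,a2,a3>
// and B = <b0,b1,b2,b3>, pmuludq multiplies the even lanes, giving
// <a0*b0, a2*b2> as v2i64; the unpack shuffles above move the odd lanes
// into even positions so the second pmuludq yields <a1*b1, a3*b3>. After
// the bitcasts, lanes 0 and 2 of Evens/Odds hold the low 32 bits of each
// product, and the mask <0,4,2,6> below interleaves them back into
// <a0*b0, a1*b1, a2*b2, a3*b3>.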
12439 static const int ShufMask[] = { 0, 4, 2, 6 }; 12440 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); 12441 } 12442 12443 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && 12444 "Only know how to lower V2I64/V4I64/V8I64 multiply"); 12445 12446 // Ahi = psrlqi(a, 32); 12447 // Bhi = psrlqi(b, 32); 12448 // 12449 // AloBlo = pmuludq(a, b); 12450 // AloBhi = pmuludq(a, Bhi); 12451 // AhiBlo = pmuludq(Ahi, b); 12452 12453 // AloBhi = psllqi(AloBhi, 32); 12454 // AhiBlo = psllqi(AhiBlo, 32); 12455 // return AloBlo + AloBhi + AhiBlo; 12456 12457 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); 12458 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); 12459 12460 // Bit cast to 32-bit vectors for MULUDQ 12461 EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : 12462 (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; 12463 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); 12464 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); 12465 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); 12466 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); 12467 12468 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); 12469 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); 12470 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); 12471 12472 AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); 12473 AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); 12474 12475 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 12476 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 12477} 12478 12479static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { 12480 EVT VT = Op.getValueType(); 12481 EVT EltTy = VT.getVectorElementType(); 12482 unsigned NumElts = VT.getVectorNumElements(); 12483 SDValue N0 = Op.getOperand(0); 12484 SDLoc dl(Op); 12485 12486 // Lower sdiv X, pow2-const. 12487 BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1)); 12488 if (!C) 12489 return SDValue(); 12490 12491 APInt SplatValue, SplatUndef; 12492 unsigned SplatBitSize; 12493 bool HasAnyUndefs; 12494 if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, 12495 HasAnyUndefs) || 12496 EltTy.getSizeInBits() < SplatBitSize) 12497 return SDValue(); 12498 12499 if ((SplatValue != 0) && 12500 (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { 12501 unsigned Lg2 = SplatValue.countTrailingZeros(); 12502 // Splat the sign bit. 12503 SmallVector<SDValue, 16> Sz(NumElts, 12504 DAG.getConstant(EltTy.getSizeInBits() - 1, 12505 EltTy)); 12506 SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0, 12507 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0], 12508 NumElts)); 12509 // Add (N0 < 0) ? abs2 - 1 : 0; 12510 SmallVector<SDValue, 16> Amt(NumElts, 12511 DAG.getConstant(EltTy.getSizeInBits() - Lg2, 12512 EltTy)); 12513 SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN, 12514 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0], 12515 NumElts)); 12516 SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); 12517 SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy)); 12518 SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD, 12519 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0], 12520 NumElts)); 12521 12522 // If we're dividing by a positive value, we're done. Otherwise, we must 12523 // negate the result. 
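// Numeric check of the sequence above (illustrative) for x sdiv 8, i.e.
// Lg2 == 3, on i32 lanes:
//   SGN = x s>> 31           ; -1 if x < 0, else 0
//   SRL = SGN u>> (32 - 3)   ; 7 if x < 0, else 0
//   ADD = x + SRL            ; biases negative x toward zero
//   SRA = ADD s>> 3          ; e.g. x = -9: -9 + 7 = -2, -2 s>> 3 = -1,
//                            ; matching -9 / 8 == -1 under C truncation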
12524 if (SplatValue.isNonNegative()) 12525 return SRA; 12526 12527 SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy)); 12528 SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts); 12529 return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA); 12530 } 12531 return SDValue(); 12532} 12533 12534static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, 12535 const X86Subtarget *Subtarget) { 12536 EVT VT = Op.getValueType(); 12537 SDLoc dl(Op); 12538 SDValue R = Op.getOperand(0); 12539 SDValue Amt = Op.getOperand(1); 12540 12541 // Optimize shl/srl/sra with constant shift amount. 12542 if (isSplatVector(Amt.getNode())) { 12543 SDValue SclrAmt = Amt->getOperand(0); 12544 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 12545 uint64_t ShiftAmt = C->getZExtValue(); 12546 12547 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 12548 (Subtarget->hasInt256() && 12549 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) || 12550 (Subtarget->hasAVX512() && 12551 (VT == MVT::v8i64 || VT == MVT::v16i32))) { 12552 if (Op.getOpcode() == ISD::SHL) 12553 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, 12554 DAG); 12555 if (Op.getOpcode() == ISD::SRL) 12556 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, 12557 DAG); 12558 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) 12559 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, 12560 DAG); 12561 } 12562 12563 if (VT == MVT::v16i8) { 12564 if (Op.getOpcode() == ISD::SHL) { 12565 // Make a large shift. 12566 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, 12567 MVT::v8i16, R, ShiftAmt, 12568 DAG); 12569 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 12570 // Zero out the rightmost bits. 12571 SmallVector<SDValue, 16> V(16, 12572 DAG.getConstant(uint8_t(-1U << ShiftAmt), 12573 MVT::i8)); 12574 return DAG.getNode(ISD::AND, dl, VT, SHL, 12575 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 12576 } 12577 if (Op.getOpcode() == ISD::SRL) { 12578 // Make a large shift. 12579 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, 12580 MVT::v8i16, R, ShiftAmt, 12581 DAG); 12582 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 12583 // Zero out the leftmost bits. 12584 SmallVector<SDValue, 16> V(16, 12585 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 12586 MVT::i8)); 12587 return DAG.getNode(ISD::AND, dl, VT, SRL, 12588 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 12589 } 12590 if (Op.getOpcode() == ISD::SRA) { 12591 if (ShiftAmt == 7) { 12592 // R s>> 7 === R s< 0 12593 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 12594 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 12595 } 12596 12597 // R s>> a === ((R u>> a) ^ m) - m 12598 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 12599 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, 12600 MVT::i8)); 12601 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); 12602 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 12603 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 12604 return Res; 12605 } 12606 llvm_unreachable("Unknown shift opcode."); 12607 } 12608 12609 if (Subtarget->hasInt256() && VT == MVT::v32i8) { 12610 if (Op.getOpcode() == ISD::SHL) { 12611 // Make a large shift. 12612 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, 12613 MVT::v16i16, R, ShiftAmt, 12614 DAG); 12615 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 12616 // Zero out the rightmost bits. 
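// Why the mask is needed (a sketch): psllw shifts 16-bit lanes, so bits
// shifted out of each low byte leak into the adjacent high byte. Masking
// every byte with (-1 << ShiftAmt) clears exactly those leaked bits. For
// ShiftAmt == 3 the mask byte is 0xF8: a 16-bit lane 0x01FF shifts to
// 0x0FF8, and 0x0FF8 & 0xF8F8 = 0x08F8, which matches a per-byte <<3 of
// the bytes <0xFF, 0x01>.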
12617 SmallVector<SDValue, 32> V(32, 12618 DAG.getConstant(uint8_t(-1U << ShiftAmt), 12619 MVT::i8)); 12620 return DAG.getNode(ISD::AND, dl, VT, SHL, 12621 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 12622 } 12623 if (Op.getOpcode() == ISD::SRL) { 12624 // Make a large shift. 12625 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, 12626 MVT::v16i16, R, ShiftAmt, 12627 DAG); 12628 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 12629 // Zero out the leftmost bits. 12630 SmallVector<SDValue, 32> V(32, 12631 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 12632 MVT::i8)); 12633 return DAG.getNode(ISD::AND, dl, VT, SRL, 12634 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 12635 } 12636 if (Op.getOpcode() == ISD::SRA) { 12637 if (ShiftAmt == 7) { 12638 // R s>> 7 === R s< 0 12639 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 12640 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 12641 } 12642 12643 // R s>> a === ((R u>> a) ^ m) - m 12644 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 12645 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, 12646 MVT::i8)); 12647 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); 12648 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 12649 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 12650 return Res; 12651 } 12652 llvm_unreachable("Unknown shift opcode."); 12653 } 12654 } 12655 } 12656 12657 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 12658 if (!Subtarget->is64Bit() && 12659 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && 12660 Amt.getOpcode() == ISD::BITCAST && 12661 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 12662 Amt = Amt.getOperand(0); 12663 unsigned Ratio = Amt.getValueType().getVectorNumElements() / 12664 VT.getVectorNumElements(); 12665 unsigned RatioInLog2 = Log2_32_Ceil(Ratio); 12666 uint64_t ShiftAmt = 0; 12667 for (unsigned i = 0; i != Ratio; ++i) { 12668 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); 12669 if (C == 0) 12670 return SDValue(); 12671 // 6 == Log2(64) 12672 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); 12673 } 12674 // Check remaining shift amounts. 
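// Example of the decoding above (illustrative): a v2i64 shift by 5 built
// in 32-bit mode shows up as Amt = bitcast (build_vector <5, 0, 5, 0>),
// so Ratio == 2 and the first i64 amount is reassembled as
// ShiftAmt = (5 << 0) | (0 << 32) == 5. The loop below just verifies that
// every remaining group of Ratio elements decodes to the same value.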
12675 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 12676 uint64_t ShAmt = 0; 12677 for (unsigned j = 0; j != Ratio; ++j) { 12678 ConstantSDNode *C = 12679 dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); 12680 if (C == 0) 12681 return SDValue(); 12682 // 6 == Log2(64) 12683 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); 12684 } 12685 if (ShAmt != ShiftAmt) 12686 return SDValue(); 12687 } 12688 switch (Op.getOpcode()) { 12689 default: 12690 llvm_unreachable("Unknown shift opcode!"); 12691 case ISD::SHL: 12692 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, 12693 DAG); 12694 case ISD::SRL: 12695 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, 12696 DAG); 12697 case ISD::SRA: 12698 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, 12699 DAG); 12700 } 12701 } 12702 12703 return SDValue(); 12704} 12705 12706static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, 12707 const X86Subtarget* Subtarget) { 12708 EVT VT = Op.getValueType(); 12709 SDLoc dl(Op); 12710 SDValue R = Op.getOperand(0); 12711 SDValue Amt = Op.getOperand(1); 12712 12713 if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) || 12714 VT == MVT::v4i32 || VT == MVT::v8i16 || 12715 (Subtarget->hasInt256() && 12716 ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) || 12717 VT == MVT::v8i32 || VT == MVT::v16i16)) || 12718 (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) { 12719 SDValue BaseShAmt; 12720 EVT EltVT = VT.getVectorElementType(); 12721 12722 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 12723 unsigned NumElts = VT.getVectorNumElements(); 12724 unsigned i, j; 12725 for (i = 0; i != NumElts; ++i) { 12726 if (Amt.getOperand(i).getOpcode() == ISD::UNDEF) 12727 continue; 12728 break; 12729 } 12730 for (j = i; j != NumElts; ++j) { 12731 SDValue Arg = Amt.getOperand(j); 12732 if (Arg.getOpcode() == ISD::UNDEF) continue; 12733 if (Arg != Amt.getOperand(i)) 12734 break; 12735 } 12736 if (i != NumElts && j == NumElts) 12737 BaseShAmt = Amt.getOperand(i); 12738 } else { 12739 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) 12740 Amt = Amt.getOperand(0); 12741 if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE && 12742 cast<ShuffleVectorSDNode>(Amt)->isSplat()) { 12743 SDValue InVec = Amt.getOperand(0); 12744 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 12745 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 12746 unsigned i = 0; 12747 for (; i != NumElts; ++i) { 12748 SDValue Arg = InVec.getOperand(i); 12749 if (Arg.getOpcode() == ISD::UNDEF) continue; 12750 BaseShAmt = Arg; 12751 break; 12752 } 12753 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 12754 if (ConstantSDNode *C = 12755 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 12756 unsigned SplatIdx = 12757 cast<ShuffleVectorSDNode>(Amt)->getSplatIndex(); 12758 if (C->getZExtValue() == SplatIdx) 12759 BaseShAmt = InVec.getOperand(1); 12760 } 12761 } 12762 if (BaseShAmt.getNode() == 0) 12763 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, 12764 DAG.getIntPtrConstant(0)); 12765 } 12766 } 12767 12768 if (BaseShAmt.getNode()) { 12769 if (EltVT.bitsGT(MVT::i32)) 12770 BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt); 12771 else if (EltVT.bitsLT(MVT::i32)) 12772 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); 12773 12774 switch (Op.getOpcode()) { 12775 default: 12776 llvm_unreachable("Unknown shift opcode!"); 12777 case ISD::SHL: 12778 switch (VT.getSimpleVT().SimpleTy) { 12779 default: 
return SDValue(); 12780 case MVT::v2i64: 12781 case MVT::v4i32: 12782 case MVT::v8i16: 12783 case MVT::v4i64: 12784 case MVT::v8i32: 12785 case MVT::v16i16: 12786 case MVT::v16i32: 12787 case MVT::v8i64: 12788 return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG); 12789 } 12790 case ISD::SRA: 12791 switch (VT.getSimpleVT().SimpleTy) { 12792 default: return SDValue(); 12793 case MVT::v4i32: 12794 case MVT::v8i16: 12795 case MVT::v8i32: 12796 case MVT::v16i16: 12797 case MVT::v16i32: 12798 case MVT::v8i64: 12799 return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG); 12800 } 12801 case ISD::SRL: 12802 switch (VT.getSimpleVT().SimpleTy) { 12803 default: return SDValue(); 12804 case MVT::v2i64: 12805 case MVT::v4i32: 12806 case MVT::v8i16: 12807 case MVT::v4i64: 12808 case MVT::v8i32: 12809 case MVT::v16i16: 12810 case MVT::v16i32: 12811 case MVT::v8i64: 12812 return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG); 12813 } 12814 } 12815 } 12816 } 12817 12818 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 12819 if (!Subtarget->is64Bit() && 12820 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) || 12821 (Subtarget->hasAVX512() && VT == MVT::v8i64)) && 12822 Amt.getOpcode() == ISD::BITCAST && 12823 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 12824 Amt = Amt.getOperand(0); 12825 unsigned Ratio = Amt.getValueType().getVectorNumElements() / 12826 VT.getVectorNumElements(); 12827 std::vector<SDValue> Vals(Ratio); 12828 for (unsigned i = 0; i != Ratio; ++i) 12829 Vals[i] = Amt.getOperand(i); 12830 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 12831 for (unsigned j = 0; j != Ratio; ++j) 12832 if (Vals[j] != Amt.getOperand(i + j)) 12833 return SDValue(); 12834 } 12835 switch (Op.getOpcode()) { 12836 default: 12837 llvm_unreachable("Unknown shift opcode!"); 12838 case ISD::SHL: 12839 return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1)); 12840 case ISD::SRL: 12841 return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1)); 12842 case ISD::SRA: 12843 return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1)); 12844 } 12845 } 12846 12847 return SDValue(); 12848} 12849 12850static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, 12851 SelectionDAG &DAG) { 12852 12853 EVT VT = Op.getValueType(); 12854 SDLoc dl(Op); 12855 SDValue R = Op.getOperand(0); 12856 SDValue Amt = Op.getOperand(1); 12857 SDValue V; 12858 12859 if (!Subtarget->hasSSE2()) 12860 return SDValue(); 12861 12862 V = LowerScalarImmediateShift(Op, DAG, Subtarget); 12863 if (V.getNode()) 12864 return V; 12865 12866 V = LowerScalarVariableShift(Op, DAG, Subtarget); 12867 if (V.getNode()) 12868 return V; 12869 12870 if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64)) 12871 return Op; 12872 // AVX2 has VPSLLV/VPSRAV/VPSRLV. 12873 if (Subtarget->hasInt256()) { 12874 if (Op.getOpcode() == ISD::SRL && 12875 (VT == MVT::v2i64 || VT == MVT::v4i32 || 12876 VT == MVT::v4i64 || VT == MVT::v8i32)) 12877 return Op; 12878 if (Op.getOpcode() == ISD::SHL && 12879 (VT == MVT::v2i64 || VT == MVT::v4i32 || 12880 VT == MVT::v4i64 || VT == MVT::v8i32)) 12881 return Op; 12882 if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32)) 12883 return Op; 12884 } 12885 12886 // Lower SHL with variable shift amount. 
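// The v4i32 case below builds 2^amt in each lane with an IEEE-754 bit
// trick and then multiplies (a sketch of the reasoning): (amt << 23)
// places the shift count in the float exponent field, and adding
// 0x3f800000 (the bits of 1.0f) biases it, so the lane holds the float
// 2^amt for amt in [0, 31]. For amt == 5: (5 << 23) + 0x3f800000 =
// 0x42000000 = 32.0f, and x * 32 == x << 5.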
12887 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 12888 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); 12889 12890 Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT)); 12891 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 12892 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 12893 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 12894 } 12895 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 12896 assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); 12897 12898 // a = a << 5; 12899 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT)); 12900 Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); 12901 12902 // Turn 'a' into a mask suitable for VSELECT 12903 SDValue VSelM = DAG.getConstant(0x80, VT); 12904 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 12905 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 12906 12907 SDValue CM1 = DAG.getConstant(0x0f, VT); 12908 SDValue CM2 = DAG.getConstant(0x3f, VT); 12909 12910 // r = VSELECT(r, psllw(r & (char16)15, 4), a); 12911 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); 12912 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG); 12913 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 12914 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 12915 12916 // a += a 12917 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 12918 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 12919 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 12920 12921 // r = VSELECT(r, psllw(r & (char16)63, 2), a); 12922 M = DAG.getNode(ISD::AND, dl, VT, R, CM2); 12923 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG); 12924 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 12925 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 12926 12927 // a += a 12928 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 12929 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 12930 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 12931 12932 // return VSELECT(r, r+r, a); 12933 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, 12934 DAG.getNode(ISD::ADD, dl, VT, R, R), R); 12935 return R; 12936 } 12937 12938 // Decompose 256-bit shifts into smaller 128-bit shifts. 
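// E.g. (illustrative) a v8i32 srl on AVX1, where no 256-bit integer shift
// exists: the value and the shift amounts are each split into two v4i32
// halves, the halves are shifted independently, and the results are glued
// back together with CONCAT_VECTORS.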
12939 if (VT.is256BitVector()) {
12940 unsigned NumElems = VT.getVectorNumElements();
12941 MVT EltVT = VT.getVectorElementType().getSimpleVT();
12942 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
12943
12944 // Extract the two vectors
12945 SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
12946 SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
12947
12948 // Recreate the shift amount vectors
12949 SDValue Amt1, Amt2;
12950 if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
12951 // Constant shift amount
12952 SmallVector<SDValue, 4> Amt1Csts;
12953 SmallVector<SDValue, 4> Amt2Csts;
12954 for (unsigned i = 0; i != NumElems/2; ++i)
12955 Amt1Csts.push_back(Amt->getOperand(i));
12956 for (unsigned i = NumElems/2; i != NumElems; ++i)
12957 Amt2Csts.push_back(Amt->getOperand(i));
12958
12959 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
12960 &Amt1Csts[0], NumElems/2);
12961 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
12962 &Amt2Csts[0], NumElems/2);
12963 } else {
12964 // Variable shift amount
12965 Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
12966 Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
12967 }
12968
12969 // Issue new vector shifts for the smaller types
12970 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
12971 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
12972
12973 // Concatenate the result back
12974 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
12975 }
12976
12977 return SDValue();
12978}
12979
12980static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
12981 // Lower the "add/sub/mul with overflow" instruction into a regular
12982 // instruction plus a "setcc" instruction that checks the overflow flag.
12983 // The "brcond" lowering looks for this combo and may remove the "setcc"
12984 // instruction if the "setcc" has only one use.
12985 SDNode *N = Op.getNode();
12986 SDValue LHS = N->getOperand(0);
12987 SDValue RHS = N->getOperand(1);
12988 unsigned BaseOp = 0;
12989 unsigned Cond = 0;
12990 SDLoc DL(Op);
12991 switch (Op.getOpcode()) {
12992 default: llvm_unreachable("Unknown ovf instruction!");
12993 case ISD::SADDO:
12994 // An add of one will be selected as an INC. Note that INC doesn't
12995 // set CF, so we can't do this for UADDO.
12996 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
12997 if (C->isOne()) {
12998 BaseOp = X86ISD::INC;
12999 Cond = X86::COND_O;
13000 break;
13001 }
13002 BaseOp = X86ISD::ADD;
13003 Cond = X86::COND_O;
13004 break;
13005 case ISD::UADDO:
13006 BaseOp = X86ISD::ADD;
13007 Cond = X86::COND_B;
13008 break;
13009 case ISD::SSUBO:
13010 // A subtract of one will be selected as a DEC. Note that DEC doesn't
13011 // set CF, so we can't do this for USUBO.
13012 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 13013 if (C->isOne()) { 13014 BaseOp = X86ISD::DEC; 13015 Cond = X86::COND_O; 13016 break; 13017 } 13018 BaseOp = X86ISD::SUB; 13019 Cond = X86::COND_O; 13020 break; 13021 case ISD::USUBO: 13022 BaseOp = X86ISD::SUB; 13023 Cond = X86::COND_B; 13024 break; 13025 case ISD::SMULO: 13026 BaseOp = X86ISD::SMUL; 13027 Cond = X86::COND_O; 13028 break; 13029 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 13030 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 13031 MVT::i32); 13032 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 13033 13034 SDValue SetCC = 13035 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 13036 DAG.getConstant(X86::COND_O, MVT::i32), 13037 SDValue(Sum.getNode(), 2)); 13038 13039 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 13040 } 13041 } 13042 13043 // Also sets EFLAGS. 13044 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 13045 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 13046 13047 SDValue SetCC = 13048 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 13049 DAG.getConstant(Cond, MVT::i32), 13050 SDValue(Sum.getNode(), 1)); 13051 13052 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 13053} 13054 13055SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 13056 SelectionDAG &DAG) const { 13057 SDLoc dl(Op); 13058 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 13059 EVT VT = Op.getValueType(); 13060 13061 if (!Subtarget->hasSSE2() || !VT.isVector()) 13062 return SDValue(); 13063 13064 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 13065 ExtraVT.getScalarType().getSizeInBits(); 13066 13067 switch (VT.getSimpleVT().SimpleTy) { 13068 default: return SDValue(); 13069 case MVT::v8i32: 13070 case MVT::v16i16: 13071 if (!Subtarget->hasFp256()) 13072 return SDValue(); 13073 if (!Subtarget->hasInt256()) { 13074 // needs to be split 13075 unsigned NumElems = VT.getVectorNumElements(); 13076 13077 // Extract the LHS vectors 13078 SDValue LHS = Op.getOperand(0); 13079 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 13080 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 13081 13082 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 13083 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 13084 13085 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 13086 unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); 13087 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, 13088 ExtraNumElems/2); 13089 SDValue Extra = DAG.getValueType(ExtraVT); 13090 13091 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); 13092 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); 13093 13094 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); 13095 } 13096 // fall through 13097 case MVT::v4i32: 13098 case MVT::v8i16: { 13099 // (sext (vzext x)) -> (vsext x) 13100 SDValue Op0 = Op.getOperand(0); 13101 SDValue Op00 = Op0.getOperand(0); 13102 SDValue Tmp1; 13103 // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. 
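// (sext (vzext x)) is a sign_extend_inreg whose input already has zero
// high bits, so the whole thing can become a single X86ISD::VSEXT of the
// original narrow value. If that match fails, the code further below
// falls back to the classic shift pair; e.g. (illustrative) when
// sign-extending the low 16 bits of each v4i32 lane, BitsDiff == 16 and
//   t = x << 16    ; left-justify the subword
//   r = t s>> 16   ; arithmetic shift copies the sign bit back down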
13104 if (Op0.getOpcode() == ISD::BITCAST && 13105 Op00.getOpcode() == ISD::VECTOR_SHUFFLE) 13106 Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG); 13107 if (Tmp1.getNode()) { 13108 SDValue Tmp1Op0 = Tmp1.getOperand(0); 13109 assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && 13110 "This optimization is invalid without a VZEXT."); 13111 return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); 13112 } 13113 13114 // If the above didn't work, then just use Shift-Left + Shift-Right. 13115 Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff, 13116 DAG); 13117 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff, 13118 DAG); 13119 } 13120 } 13121} 13122 13123static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, 13124 SelectionDAG &DAG) { 13125 SDLoc dl(Op); 13126 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 13127 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 13128 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 13129 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 13130 13131 // The only fence that needs an instruction is a sequentially-consistent 13132 // cross-thread fence. 13133 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 13134 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 13135 // no-sse2). There isn't any reason to disable it if the target processor 13136 // supports it. 13137 if (Subtarget->hasSSE2() || Subtarget->is64Bit()) 13138 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 13139 13140 SDValue Chain = Op.getOperand(0); 13141 SDValue Zero = DAG.getConstant(0, MVT::i32); 13142 SDValue Ops[] = { 13143 DAG.getRegister(X86::ESP, MVT::i32), // Base 13144 DAG.getTargetConstant(1, MVT::i8), // Scale 13145 DAG.getRegister(0, MVT::i32), // Index 13146 DAG.getTargetConstant(0, MVT::i32), // Disp 13147 DAG.getRegister(0, MVT::i32), // Segment. 13148 Zero, 13149 Chain 13150 }; 13151 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); 13152 return SDValue(Res, 0); 13153 } 13154 13155 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
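// Summarizing the three outcomes of this function (roughly; exact
// encodings depend on the subtarget):
//   seq_cst cross-thread, SSE2 or x86-64:  mfence
//   seq_cst cross-thread, pre-SSE2 32-bit: lock orl $0, (%esp)
//   anything weaker:                       no instruction at all
// The locked or is a dummy read-modify-write of the stack slot; any
// LOCKed instruction acts as a full barrier on x86.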
13156 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 13157} 13158 13159static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, 13160 SelectionDAG &DAG) { 13161 EVT T = Op.getValueType(); 13162 SDLoc DL(Op); 13163 unsigned Reg = 0; 13164 unsigned size = 0; 13165 switch(T.getSimpleVT().SimpleTy) { 13166 default: llvm_unreachable("Invalid value type!"); 13167 case MVT::i8: Reg = X86::AL; size = 1; break; 13168 case MVT::i16: Reg = X86::AX; size = 2; break; 13169 case MVT::i32: Reg = X86::EAX; size = 4; break; 13170 case MVT::i64: 13171 assert(Subtarget->is64Bit() && "Node not type legal!"); 13172 Reg = X86::RAX; size = 8; 13173 break; 13174 } 13175 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 13176 Op.getOperand(2), SDValue()); 13177 SDValue Ops[] = { cpIn.getValue(0), 13178 Op.getOperand(1), 13179 Op.getOperand(3), 13180 DAG.getTargetConstant(size, MVT::i8), 13181 cpIn.getValue(1) }; 13182 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 13183 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 13184 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 13185 Ops, array_lengthof(Ops), T, MMO); 13186 SDValue cpOut = 13187 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 13188 return cpOut; 13189} 13190 13191static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, 13192 SelectionDAG &DAG) { 13193 assert(Subtarget->is64Bit() && "Result not type legalized?"); 13194 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 13195 SDValue TheChain = Op.getOperand(0); 13196 SDLoc dl(Op); 13197 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 13198 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 13199 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 13200 rax.getValue(2)); 13201 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 13202 DAG.getConstant(32, MVT::i8)); 13203 SDValue Ops[] = { 13204 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 13205 rdx.getValue(1) 13206 }; 13207 return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); 13208} 13209 13210static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, 13211 SelectionDAG &DAG) { 13212 MVT SrcVT = Op.getOperand(0).getSimpleValueType(); 13213 MVT DstVT = Op.getSimpleValueType(); 13214 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 13215 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 13216 assert((DstVT == MVT::i64 || 13217 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 13218 "Unexpected custom BITCAST"); 13219 // i64 <=> MMX conversions are Legal. 13220 if (SrcVT==MVT::i64 && DstVT.isVector()) 13221 return Op; 13222 if (DstVT==MVT::i64 && SrcVT.isVector()) 13223 return Op; 13224 // MMX <=> MMX conversions are Legal. 13225 if (SrcVT.isVector() && DstVT.isVector()) 13226 return Op; 13227 // All other conversions need to be expanded. 
13228 return SDValue(); 13229} 13230 13231static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 13232 SDNode *Node = Op.getNode(); 13233 SDLoc dl(Node); 13234 EVT T = Node->getValueType(0); 13235 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 13236 DAG.getConstant(0, T), Node->getOperand(2)); 13237 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 13238 cast<AtomicSDNode>(Node)->getMemoryVT(), 13239 Node->getOperand(0), 13240 Node->getOperand(1), negOp, 13241 cast<AtomicSDNode>(Node)->getSrcValue(), 13242 cast<AtomicSDNode>(Node)->getAlignment(), 13243 cast<AtomicSDNode>(Node)->getOrdering(), 13244 cast<AtomicSDNode>(Node)->getSynchScope()); 13245} 13246 13247static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 13248 SDNode *Node = Op.getNode(); 13249 SDLoc dl(Node); 13250 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 13251 13252 // Convert seq_cst store -> xchg 13253 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 13254 // FIXME: On 32-bit, store -> fist or movq would be more efficient 13255 // (The only way to get a 16-byte store is cmpxchg16b) 13256 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 13257 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 13258 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 13259 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 13260 cast<AtomicSDNode>(Node)->getMemoryVT(), 13261 Node->getOperand(0), 13262 Node->getOperand(1), Node->getOperand(2), 13263 cast<AtomicSDNode>(Node)->getMemOperand(), 13264 cast<AtomicSDNode>(Node)->getOrdering(), 13265 cast<AtomicSDNode>(Node)->getSynchScope()); 13266 return Swap.getValue(1); 13267 } 13268 // Other atomic stores have a simple pattern. 13269 return Op; 13270} 13271 13272static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 13273 EVT VT = Op.getNode()->getValueType(0); 13274 13275 // Let legalize expand this if it isn't a legal type yet. 13276 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13277 return SDValue(); 13278 13279 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 13280 13281 unsigned Opc; 13282 bool ExtraOp = false; 13283 switch (Op.getOpcode()) { 13284 default: llvm_unreachable("Invalid code"); 13285 case ISD::ADDC: Opc = X86ISD::ADD; break; 13286 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 13287 case ISD::SUBC: Opc = X86ISD::SUB; break; 13288 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 13289 } 13290 13291 if (!ExtraOp) 13292 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 13293 Op.getOperand(1)); 13294 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 13295 Op.getOperand(1), Op.getOperand(2)); 13296} 13297 13298static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, 13299 SelectionDAG &DAG) { 13300 assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); 13301 13302 // For MacOSX, we want to call an alternative entry point: __sincos_stret, 13303 // which returns the values as { float, float } (in XMM0) or 13304 // { double, double } (which is returned in XMM0, XMM1). 13305 SDLoc dl(Op); 13306 SDValue Arg = Op.getOperand(0); 13307 EVT ArgVT = Arg.getValueType(); 13308 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 13309 13310 TargetLowering::ArgListTy Args; 13311 TargetLowering::ArgListEntry Entry; 13312 13313 Entry.Node = Arg; 13314 Entry.Ty = ArgTy; 13315 Entry.isSExt = false; 13316 Entry.isZExt = false; 13317 Args.push_back(Entry); 13318 13319 bool isF64 = ArgVT == MVT::f64; 13320 // Only optimize x86_64 for now. i386 is a bit messy. 
For f32,
13321 // the small struct {f32, f32} is returned in (eax, edx). For f64,
13322 // the results are returned via SRet in memory.
13323 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
13324 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13325 SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
13326
13327 Type *RetTy = isF64
13328 ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
13329 : (Type*)VectorType::get(ArgTy, 4);
13330 TargetLowering::
13331 CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
13332 false, false, false, false, 0,
13333 CallingConv::C, /*isTailCall=*/false,
13334 /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
13335 Callee, Args, DAG, dl);
13336 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
13337
13338 if (isF64)
13339 // Returned in xmm0 and xmm1.
13340 return CallResult.first;
13341
13342 // Returned in bits 0:31 and 32:63 of xmm0.
13343 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
13344 CallResult.first, DAG.getIntPtrConstant(0));
13345 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
13346 CallResult.first, DAG.getIntPtrConstant(1));
13347 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
13348 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
13349}
13350
13351/// LowerOperation - Provide custom lowering hooks for some operations.
13352///
13353SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
13354 switch (Op.getOpcode()) {
13355 default: llvm_unreachable("Should not custom lower this!");
13356 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
13357 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
13358 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG);
13359 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
13360 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
13361 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
13362 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
13363 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
13364 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
13365 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
13366 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
13367 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
13368 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
13369 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
13370 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
13371 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
13372 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
13373 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
13374 case ISD::SHL_PARTS:
13375 case ISD::SRA_PARTS:
13376 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
13377 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
13378 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
13379 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
13380 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
13381 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
13382 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
13383 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
13384 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
13385 case ISD::FP_EXTEND: return
LowerFP_EXTEND(Op, DAG); 13386 case ISD::FABS: return LowerFABS(Op, DAG); 13387 case ISD::FNEG: return LowerFNEG(Op, DAG); 13388 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 13389 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 13390 case ISD::SETCC: return LowerSETCC(Op, DAG); 13391 case ISD::SELECT: return LowerSELECT(Op, DAG); 13392 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 13393 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 13394 case ISD::VASTART: return LowerVASTART(Op, DAG); 13395 case ISD::VAARG: return LowerVAARG(Op, DAG); 13396 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); 13397 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 13398 case ISD::INTRINSIC_VOID: 13399 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); 13400 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 13401 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 13402 case ISD::FRAME_TO_ARGS_OFFSET: 13403 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 13404 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 13405 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 13406 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); 13407 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); 13408 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 13409 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 13410 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 13411 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 13412 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); 13413 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 13414 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); 13415 case ISD::SRA: 13416 case ISD::SRL: 13417 case ISD::SHL: return LowerShift(Op, Subtarget, DAG); 13418 case ISD::SADDO: 13419 case ISD::UADDO: 13420 case ISD::SSUBO: 13421 case ISD::USUBO: 13422 case ISD::SMULO: 13423 case ISD::UMULO: return LowerXALUO(Op, DAG); 13424 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); 13425 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); 13426 case ISD::ADDC: 13427 case ISD::ADDE: 13428 case ISD::SUBC: 13429 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 13430 case ISD::ADD: return LowerADD(Op, DAG); 13431 case ISD::SUB: return LowerSUB(Op, DAG); 13432 case ISD::SDIV: return LowerSDIV(Op, DAG); 13433 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); 13434 } 13435} 13436 13437static void ReplaceATOMIC_LOAD(SDNode *Node, 13438 SmallVectorImpl<SDValue> &Results, 13439 SelectionDAG &DAG) { 13440 SDLoc dl(Node); 13441 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 13442 13443 // Convert wide load -> cmpxchg8b/cmpxchg16b 13444 // FIXME: On 32-bit, load -> fild or movq would be more efficient 13445 // (The only way to get a 16-byte load is cmpxchg16b) 13446 // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. 
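// The zero/zero cmpxchg below is the usual idiom for an atomic wide load
// (a sketch): lock cmpxchg8b compares memory against 0 and, only when
// equal, writes 0 back, so the stored value never changes, while the old
// contents always come back in edx:eax atomically. (It is still a locked
// read-modify-write, so it does require writable memory.)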
13447 SDValue Zero = DAG.getConstant(0, VT); 13448 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, 13449 Node->getOperand(0), 13450 Node->getOperand(1), Zero, Zero, 13451 cast<AtomicSDNode>(Node)->getMemOperand(), 13452 cast<AtomicSDNode>(Node)->getOrdering(), 13453 cast<AtomicSDNode>(Node)->getSynchScope()); 13454 Results.push_back(Swap.getValue(0)); 13455 Results.push_back(Swap.getValue(1)); 13456} 13457 13458static void 13459ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 13460 SelectionDAG &DAG, unsigned NewOp) { 13461 SDLoc dl(Node); 13462 assert (Node->getValueType(0) == MVT::i64 && 13463 "Only know how to expand i64 atomics"); 13464 13465 SDValue Chain = Node->getOperand(0); 13466 SDValue In1 = Node->getOperand(1); 13467 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 13468 Node->getOperand(2), DAG.getIntPtrConstant(0)); 13469 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 13470 Node->getOperand(2), DAG.getIntPtrConstant(1)); 13471 SDValue Ops[] = { Chain, In1, In2L, In2H }; 13472 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 13473 SDValue Result = 13474 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64, 13475 cast<MemSDNode>(Node)->getMemOperand()); 13476 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 13477 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 13478 Results.push_back(Result.getValue(2)); 13479} 13480 13481/// ReplaceNodeResults - Replace a node with an illegal result type 13482/// with a new node built out of custom code. 13483void X86TargetLowering::ReplaceNodeResults(SDNode *N, 13484 SmallVectorImpl<SDValue>&Results, 13485 SelectionDAG &DAG) const { 13486 SDLoc dl(N); 13487 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13488 switch (N->getOpcode()) { 13489 default: 13490 llvm_unreachable("Do not know how to custom type legalize this operation!"); 13491 case ISD::SIGN_EXTEND_INREG: 13492 case ISD::ADDC: 13493 case ISD::ADDE: 13494 case ISD::SUBC: 13495 case ISD::SUBE: 13496 // We don't want to expand or promote these. 13497 return; 13498 case ISD::FP_TO_SINT: 13499 case ISD::FP_TO_UINT: { 13500 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 13501 13502 if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) 13503 return; 13504 13505 std::pair<SDValue,SDValue> Vals = 13506 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); 13507 SDValue FIST = Vals.first, StackSlot = Vals.second; 13508 if (FIST.getNode() != 0) { 13509 EVT VT = N->getValueType(0); 13510 // Return a load from the stack slot. 
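// (The stack slot exists because the x87 fist/fistp instructions can only
// store to memory; FP_TO_INTHelper emits that store, and the load below
// reads the value back as an ordinary integer node. When FP_TO_INTHelper
// lowered the conversion another way, e.g. the Win32 _ftol2 libcall path
// that returns its result in registers, there is no slot, hence the null
// check.)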
13511 if (StackSlot.getNode() != 0) 13512 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 13513 MachinePointerInfo(), 13514 false, false, false, 0)); 13515 else 13516 Results.push_back(FIST); 13517 } 13518 return; 13519 } 13520 case ISD::UINT_TO_FP: { 13521 assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); 13522 if (N->getOperand(0).getValueType() != MVT::v2i32 || 13523 N->getValueType(0) != MVT::v2f32) 13524 return; 13525 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, 13526 N->getOperand(0)); 13527 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 13528 MVT::f64); 13529 SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); 13530 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, 13531 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias)); 13532 Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or); 13533 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); 13534 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); 13535 return; 13536 } 13537 case ISD::FP_ROUND: { 13538 if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) 13539 return; 13540 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); 13541 Results.push_back(V); 13542 return; 13543 } 13544 case ISD::READCYCLECOUNTER: { 13545 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 13546 SDValue TheChain = N->getOperand(0); 13547 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 13548 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 13549 rd.getValue(1)); 13550 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 13551 eax.getValue(2)); 13552 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 13553 SDValue Ops[] = { eax, edx }; 13554 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 13555 array_lengthof(Ops))); 13556 Results.push_back(edx.getValue(1)); 13557 return; 13558 } 13559 case ISD::ATOMIC_CMP_SWAP: { 13560 EVT T = N->getValueType(0); 13561 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); 13562 bool Regs64bit = T == MVT::i128; 13563 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; 13564 SDValue cpInL, cpInH; 13565 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 13566 DAG.getConstant(0, HalfT)); 13567 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 13568 DAG.getConstant(1, HalfT)); 13569 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, 13570 Regs64bit ? X86::RAX : X86::EAX, 13571 cpInL, SDValue()); 13572 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, 13573 Regs64bit ? X86::RDX : X86::EDX, 13574 cpInH, cpInL.getValue(1)); 13575 SDValue swapInL, swapInH; 13576 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 13577 DAG.getConstant(0, HalfT)); 13578 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 13579 DAG.getConstant(1, HalfT)); 13580 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, 13581 Regs64bit ? X86::RBX : X86::EBX, 13582 swapInL, cpInH.getValue(1)); 13583 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, 13584 Regs64bit ? X86::RCX : X86::ECX, 13585 swapInH, swapInL.getValue(1)); 13586 SDValue Ops[] = { swapInH.getValue(0), 13587 N->getOperand(1), 13588 swapInH.getValue(1) }; 13589 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 13590 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 13591 unsigned Opcode = Regs64bit ? 
X86ISD::LCMPXCHG16_DAG : 13592 X86ISD::LCMPXCHG8_DAG; 13593 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, 13594 Ops, array_lengthof(Ops), T, MMO); 13595 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, 13596 Regs64bit ? X86::RAX : X86::EAX, 13597 HalfT, Result.getValue(1)); 13598 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, 13599 Regs64bit ? X86::RDX : X86::EDX, 13600 HalfT, cpOutL.getValue(2)); 13601 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 13602 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); 13603 Results.push_back(cpOutH.getValue(1)); 13604 return; 13605 } 13606 case ISD::ATOMIC_LOAD_ADD: 13607 case ISD::ATOMIC_LOAD_AND: 13608 case ISD::ATOMIC_LOAD_NAND: 13609 case ISD::ATOMIC_LOAD_OR: 13610 case ISD::ATOMIC_LOAD_SUB: 13611 case ISD::ATOMIC_LOAD_XOR: 13612 case ISD::ATOMIC_LOAD_MAX: 13613 case ISD::ATOMIC_LOAD_MIN: 13614 case ISD::ATOMIC_LOAD_UMAX: 13615 case ISD::ATOMIC_LOAD_UMIN: 13616 case ISD::ATOMIC_SWAP: { 13617 unsigned Opc; 13618 switch (N->getOpcode()) { 13619 default: llvm_unreachable("Unexpected opcode"); 13620 case ISD::ATOMIC_LOAD_ADD: 13621 Opc = X86ISD::ATOMADD64_DAG; 13622 break; 13623 case ISD::ATOMIC_LOAD_AND: 13624 Opc = X86ISD::ATOMAND64_DAG; 13625 break; 13626 case ISD::ATOMIC_LOAD_NAND: 13627 Opc = X86ISD::ATOMNAND64_DAG; 13628 break; 13629 case ISD::ATOMIC_LOAD_OR: 13630 Opc = X86ISD::ATOMOR64_DAG; 13631 break; 13632 case ISD::ATOMIC_LOAD_SUB: 13633 Opc = X86ISD::ATOMSUB64_DAG; 13634 break; 13635 case ISD::ATOMIC_LOAD_XOR: 13636 Opc = X86ISD::ATOMXOR64_DAG; 13637 break; 13638 case ISD::ATOMIC_LOAD_MAX: 13639 Opc = X86ISD::ATOMMAX64_DAG; 13640 break; 13641 case ISD::ATOMIC_LOAD_MIN: 13642 Opc = X86ISD::ATOMMIN64_DAG; 13643 break; 13644 case ISD::ATOMIC_LOAD_UMAX: 13645 Opc = X86ISD::ATOMUMAX64_DAG; 13646 break; 13647 case ISD::ATOMIC_LOAD_UMIN: 13648 Opc = X86ISD::ATOMUMIN64_DAG; 13649 break; 13650 case ISD::ATOMIC_SWAP: 13651 Opc = X86ISD::ATOMSWAP64_DAG; 13652 break; 13653 } 13654 ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); 13655 return; 13656 } 13657 case ISD::ATOMIC_LOAD: 13658 ReplaceATOMIC_LOAD(N, Results, DAG); 13659 } 13660} 13661 13662const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 13663 switch (Opcode) { 13664 default: return NULL; 13665 case X86ISD::BSF: return "X86ISD::BSF"; 13666 case X86ISD::BSR: return "X86ISD::BSR"; 13667 case X86ISD::SHLD: return "X86ISD::SHLD"; 13668 case X86ISD::SHRD: return "X86ISD::SHRD"; 13669 case X86ISD::FAND: return "X86ISD::FAND"; 13670 case X86ISD::FANDN: return "X86ISD::FANDN"; 13671 case X86ISD::FOR: return "X86ISD::FOR"; 13672 case X86ISD::FXOR: return "X86ISD::FXOR"; 13673 case X86ISD::FSRL: return "X86ISD::FSRL"; 13674 case X86ISD::FILD: return "X86ISD::FILD"; 13675 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 13676 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 13677 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 13678 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 13679 case X86ISD::FLD: return "X86ISD::FLD"; 13680 case X86ISD::FST: return "X86ISD::FST"; 13681 case X86ISD::CALL: return "X86ISD::CALL"; 13682 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 13683 case X86ISD::BT: return "X86ISD::BT"; 13684 case X86ISD::CMP: return "X86ISD::CMP"; 13685 case X86ISD::COMI: return "X86ISD::COMI"; 13686 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 13687 case X86ISD::CMPM: return "X86ISD::CMPM"; 13688 case X86ISD::CMPMU: return "X86ISD::CMPMU"; 
13689 case X86ISD::SETCC: return "X86ISD::SETCC"; 13690 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 13691 case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; 13692 case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; 13693 case X86ISD::CMOV: return "X86ISD::CMOV"; 13694 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 13695 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 13696 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 13697 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 13698 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 13699 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 13700 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 13701 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 13702 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 13703 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 13704 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 13705 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 13706 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 13707 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 13708 case X86ISD::PSIGN: return "X86ISD::PSIGN"; 13709 case X86ISD::BLENDV: return "X86ISD::BLENDV"; 13710 case X86ISD::BLENDI: return "X86ISD::BLENDI"; 13711 case X86ISD::SUBUS: return "X86ISD::SUBUS"; 13712 case X86ISD::HADD: return "X86ISD::HADD"; 13713 case X86ISD::HSUB: return "X86ISD::HSUB"; 13714 case X86ISD::FHADD: return "X86ISD::FHADD"; 13715 case X86ISD::FHSUB: return "X86ISD::FHSUB"; 13716 case X86ISD::UMAX: return "X86ISD::UMAX"; 13717 case X86ISD::UMIN: return "X86ISD::UMIN"; 13718 case X86ISD::SMAX: return "X86ISD::SMAX"; 13719 case X86ISD::SMIN: return "X86ISD::SMIN"; 13720 case X86ISD::FMAX: return "X86ISD::FMAX"; 13721 case X86ISD::FMIN: return "X86ISD::FMIN"; 13722 case X86ISD::FMAXC: return "X86ISD::FMAXC"; 13723 case X86ISD::FMINC: return "X86ISD::FMINC"; 13724 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 13725 case X86ISD::FRCP: return "X86ISD::FRCP"; 13726 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 13727 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; 13728 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 13729 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; 13730 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; 13731 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 13732 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 13733 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 13734 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; 13735 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 13736 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 13737 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 13738 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 13739 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 13740 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 13741 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 13742 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 13743 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 13744 case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL"; 13745 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 13746 case X86ISD::VZEXT: return "X86ISD::VZEXT"; 13747 case X86ISD::VSEXT: return "X86ISD::VSEXT"; 13748 case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; 13749 case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM"; 13750 case X86ISD::VINSERT: return "X86ISD::VINSERT"; 13751 case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; 13752 case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; 13753 case 
X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; 13754 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; 13755 case X86ISD::VSHL: return "X86ISD::VSHL"; 13756 case X86ISD::VSRL: return "X86ISD::VSRL"; 13757 case X86ISD::VSRA: return "X86ISD::VSRA"; 13758 case X86ISD::VSHLI: return "X86ISD::VSHLI"; 13759 case X86ISD::VSRLI: return "X86ISD::VSRLI"; 13760 case X86ISD::VSRAI: return "X86ISD::VSRAI"; 13761 case X86ISD::CMPP: return "X86ISD::CMPP"; 13762 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; 13763 case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; 13764 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; 13765 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; 13766 case X86ISD::ADD: return "X86ISD::ADD"; 13767 case X86ISD::SUB: return "X86ISD::SUB"; 13768 case X86ISD::ADC: return "X86ISD::ADC"; 13769 case X86ISD::SBB: return "X86ISD::SBB"; 13770 case X86ISD::SMUL: return "X86ISD::SMUL"; 13771 case X86ISD::UMUL: return "X86ISD::UMUL"; 13772 case X86ISD::INC: return "X86ISD::INC"; 13773 case X86ISD::DEC: return "X86ISD::DEC"; 13774 case X86ISD::OR: return "X86ISD::OR"; 13775 case X86ISD::XOR: return "X86ISD::XOR"; 13776 case X86ISD::AND: return "X86ISD::AND"; 13777 case X86ISD::BLSI: return "X86ISD::BLSI"; 13778 case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; 13779 case X86ISD::BLSR: return "X86ISD::BLSR"; 13780 case X86ISD::BZHI: return "X86ISD::BZHI"; 13781 case X86ISD::BEXTR: return "X86ISD::BEXTR"; 13782 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 13783 case X86ISD::PTEST: return "X86ISD::PTEST"; 13784 case X86ISD::TESTP: return "X86ISD::TESTP"; 13785 case X86ISD::TESTM: return "X86ISD::TESTM"; 13786 case X86ISD::KORTEST: return "X86ISD::KORTEST"; 13787 case X86ISD::KTEST: return "X86ISD::KTEST"; 13788 case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; 13789 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 13790 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 13791 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 13792 case X86ISD::SHUFP: return "X86ISD::SHUFP"; 13793 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 13794 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 13795 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 13796 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 13797 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 13798 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 13799 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 13800 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 13801 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 13802 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 13803 case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; 13804 case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; 13805 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 13806 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; 13807 case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; 13808 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; 13809 case X86ISD::VPERMV: return "X86ISD::VPERMV"; 13810 case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; 13811 case X86ISD::VPERMI: return "X86ISD::VPERMI"; 13812 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; 13813 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 13814 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 13815 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 13816 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 13817 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; 13818 case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; 13819 case X86ISD::SAHF: return "X86ISD::SAHF"; 13820 case X86ISD::RDRAND: return "X86ISD::RDRAND"; 13821 case 
X86ISD::RDSEED: return "X86ISD::RDSEED"; 13822 case X86ISD::FMADD: return "X86ISD::FMADD"; 13823 case X86ISD::FMSUB: return "X86ISD::FMSUB"; 13824 case X86ISD::FNMADD: return "X86ISD::FNMADD"; 13825 case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; 13826 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; 13827 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; 13828 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; 13829 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; 13830 case X86ISD::XTEST: return "X86ISD::XTEST"; 13831 } 13832} 13833 13834// isLegalAddressingMode - Return true if the addressing mode represented 13835// by AM is legal for this target, for a load/store of the specified type. 13836bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 13837 Type *Ty) const { 13838 // X86 supports extremely general addressing modes. 13839 CodeModel::Model M = getTargetMachine().getCodeModel(); 13840 Reloc::Model R = getTargetMachine().getRelocationModel(); 13841 13842 // X86 allows a sign-extended 32-bit immediate field as a displacement. 13843 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 13844 return false; 13845 13846 if (AM.BaseGV) { 13847 unsigned GVFlags = 13848 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 13849 13850 // If a reference to this global requires an extra load, we can't fold it. 13851 if (isGlobalStubReference(GVFlags)) 13852 return false; 13853 13854 // If BaseGV requires a register for the PIC base, we cannot also have a 13855 // BaseReg specified. 13856 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 13857 return false; 13858 13859 // If lower 4G is not available, then we must use rip-relative addressing. 13860 if ((M != CodeModel::Small || R != Reloc::Static) && 13861 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 13862 return false; 13863 } 13864 13865 switch (AM.Scale) { 13866 case 0: 13867 case 1: 13868 case 2: 13869 case 4: 13870 case 8: 13871 // These scales always work. 13872 break; 13873 case 3: 13874 case 5: 13875 case 9: 13876 // These scales are formed with basereg+scalereg. Only accept if there is 13877 // no basereg yet. 13878 if (AM.HasBaseReg) 13879 return false; 13880 break; 13881 default: // Other stuff never works. 13882 return false; 13883 } 13884 13885 return true; 13886} 13887 13888bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 13889 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 13890 return false; 13891 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 13892 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 13893 return NumBits1 > NumBits2; 13894} 13895 13896bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 13897 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 13898 return false; 13899 13900 if (!isTypeLegal(EVT::getEVT(Ty1))) 13901 return false; 13902 13903 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 13904 13905 // Assuming the caller doesn't have a zeroext or signext return parameter, 13906 // truncation all the way down to i1 is valid. 13907 return true; 13908} 13909 13910bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { 13911 return isInt<32>(Imm); 13912} 13913 13914bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { 13915 // Can also use sub to handle negated immediates. 
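  // e.g. 64-bit ADD only encodes a sign-extended 32-bit immediate, so any
  // constant outside [-2^31, 2^31) must first be materialized into a
  // register and is therefore not a free add-immediate.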
13916 return isInt<32>(Imm); 13917} 13918 13919bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 13920 if (!VT1.isInteger() || !VT2.isInteger()) 13921 return false; 13922 unsigned NumBits1 = VT1.getSizeInBits(); 13923 unsigned NumBits2 = VT2.getSizeInBits(); 13924 return NumBits1 > NumBits2; 13925} 13926 13927bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 13928 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 13929 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 13930} 13931 13932bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 13933 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 13934 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 13935} 13936 13937bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 13938 EVT VT1 = Val.getValueType(); 13939 if (isZExtFree(VT1, VT2)) 13940 return true; 13941 13942 if (Val.getOpcode() != ISD::LOAD) 13943 return false; 13944 13945 if (!VT1.isSimple() || !VT1.isInteger() || 13946 !VT2.isSimple() || !VT2.isInteger()) 13947 return false; 13948 13949 switch (VT1.getSimpleVT().SimpleTy) { 13950 default: break; 13951 case MVT::i8: 13952 case MVT::i16: 13953 case MVT::i32: 13954 // X86 has 8, 16, and 32-bit zero-extending loads. 13955 return true; 13956 } 13957 13958 return false; 13959} 13960 13961bool 13962X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 13963 if (!(Subtarget->hasFMA() || Subtarget->hasFMA4())) 13964 return false; 13965 13966 VT = VT.getScalarType(); 13967 13968 if (!VT.isSimple()) 13969 return false; 13970 13971 switch (VT.getSimpleVT().SimpleTy) { 13972 case MVT::f32: 13973 case MVT::f64: 13974 return true; 13975 default: 13976 break; 13977 } 13978 13979 return false; 13980} 13981 13982bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 13983 // i16 instructions are longer (0x66 prefix) and potentially slower. 13984 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 13985} 13986 13987/// isShuffleMaskLegal - Targets can use this to indicate that they only 13988/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 13989/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 13990/// are assumed to be legal. 13991bool 13992X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 13993 EVT VT) const { 13994 if (!VT.isSimple()) 13995 return false; 13996 13997 MVT SVT = VT.getSimpleVT(); 13998 13999 // Very little shuffling can be done for 64-bit vectors right now. 14000 if (VT.getSizeInBits() == 64) 14001 return false; 14002 14003 // FIXME: pshufb, blends, shifts. 
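  // Each predicate below matches a shuffle that a single x86 instruction
  // can perform, so a mask is reported legal exactly when one of these
  // fixed patterns covers it.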
  return (SVT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, SVT) ||
          isSHUFPMask(M, SVT) ||
          isPSHUFDMask(M, SVT) ||
          isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
          isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
          isPALIGNRMask(M, SVT, Subtarget) ||
          isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
}

bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  if (!VT.isSimple())
    return false;

  MVT SVT = VT.getSimpleVT();
  unsigned NumElts = SVT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && SVT.is128BitVector()) {
    return (isMOVLMask(Mask, SVT)  ||
            isCommutedMOVLMask(Mask, SVT, true) ||
            isSHUFPMask(Mask, SVT) ||
            isSHUFPMask(Mask, SVT, /* Commuted */ true));
  }
  return false;
}

//===----------------------------------------------------------------------===//
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
                                     const TargetInstrInfo *TII) {
  DebugLoc DL = MI->getDebugLoc();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = MBB;
  ++I;

  // For the v = xbegin(), we generate
  //
  // thisMBB:
  //  xbegin sinkMBB
  //
  // mainMBB:
  //  eax = -1
  //
  // sinkMBB:
  //  v = eax

  MachineBasicBlock *thisMBB = MBB;
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  //  xbegin sinkMBB
  //  # fallthrough to mainMBB
  //  # abort to sinkMBB
  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(sinkMBB);

  // mainMBB:
  //  EAX = -1
  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  // EAX is live into the sinkMBB
  sinkMBB->addLiveIn(X86::EAX);
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
    .addReg(X86::EAX);

  MI->eraseFromParent();
  return sinkMBB;
}

// Get CMPXCHG opcode for the specified data type.
14099static unsigned getCmpXChgOpcode(EVT VT) { 14100 switch (VT.getSimpleVT().SimpleTy) { 14101 case MVT::i8: return X86::LCMPXCHG8; 14102 case MVT::i16: return X86::LCMPXCHG16; 14103 case MVT::i32: return X86::LCMPXCHG32; 14104 case MVT::i64: return X86::LCMPXCHG64; 14105 default: 14106 break; 14107 } 14108 llvm_unreachable("Invalid operand size!"); 14109} 14110 14111// Get LOAD opcode for the specified data type. 14112static unsigned getLoadOpcode(EVT VT) { 14113 switch (VT.getSimpleVT().SimpleTy) { 14114 case MVT::i8: return X86::MOV8rm; 14115 case MVT::i16: return X86::MOV16rm; 14116 case MVT::i32: return X86::MOV32rm; 14117 case MVT::i64: return X86::MOV64rm; 14118 default: 14119 break; 14120 } 14121 llvm_unreachable("Invalid operand size!"); 14122} 14123 14124// Get opcode of the non-atomic one from the specified atomic instruction. 14125static unsigned getNonAtomicOpcode(unsigned Opc) { 14126 switch (Opc) { 14127 case X86::ATOMAND8: return X86::AND8rr; 14128 case X86::ATOMAND16: return X86::AND16rr; 14129 case X86::ATOMAND32: return X86::AND32rr; 14130 case X86::ATOMAND64: return X86::AND64rr; 14131 case X86::ATOMOR8: return X86::OR8rr; 14132 case X86::ATOMOR16: return X86::OR16rr; 14133 case X86::ATOMOR32: return X86::OR32rr; 14134 case X86::ATOMOR64: return X86::OR64rr; 14135 case X86::ATOMXOR8: return X86::XOR8rr; 14136 case X86::ATOMXOR16: return X86::XOR16rr; 14137 case X86::ATOMXOR32: return X86::XOR32rr; 14138 case X86::ATOMXOR64: return X86::XOR64rr; 14139 } 14140 llvm_unreachable("Unhandled atomic-load-op opcode!"); 14141} 14142 14143// Get opcode of the non-atomic one from the specified atomic instruction with 14144// extra opcode. 14145static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, 14146 unsigned &ExtraOpc) { 14147 switch (Opc) { 14148 case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr; 14149 case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr; 14150 case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr; 14151 case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr; 14152 case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr; 14153 case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr; 14154 case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr; 14155 case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr; 14156 case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr; 14157 case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr; 14158 case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr; 14159 case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr; 14160 case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr; 14161 case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr; 14162 case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr; 14163 case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr; 14164 case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr; 14165 case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr; 14166 case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr; 14167 case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr; 14168 } 14169 llvm_unreachable("Unhandled atomic-load-op opcode!"); 14170} 14171 14172// Get opcode of the non-atomic one from the specified atomic instruction for 14173// 64-bit data type on 32-bit target. 
14174static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) { 14175 switch (Opc) { 14176 case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr; 14177 case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr; 14178 case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr; 14179 case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr; 14180 case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr; 14181 case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr; 14182 case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr; 14183 case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr; 14184 case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr; 14185 case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr; 14186 } 14187 llvm_unreachable("Unhandled atomic-load-op opcode!"); 14188} 14189 14190// Get opcode of the non-atomic one from the specified atomic instruction for 14191// 64-bit data type on 32-bit target with extra opcode. 14192static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, 14193 unsigned &HiOpc, 14194 unsigned &ExtraOpc) { 14195 switch (Opc) { 14196 case X86::ATOMNAND6432: 14197 ExtraOpc = X86::NOT32r; 14198 HiOpc = X86::AND32rr; 14199 return X86::AND32rr; 14200 } 14201 llvm_unreachable("Unhandled atomic-load-op opcode!"); 14202} 14203 14204// Get pseudo CMOV opcode from the specified data type. 14205static unsigned getPseudoCMOVOpc(EVT VT) { 14206 switch (VT.getSimpleVT().SimpleTy) { 14207 case MVT::i8: return X86::CMOV_GR8; 14208 case MVT::i16: return X86::CMOV_GR16; 14209 case MVT::i32: return X86::CMOV_GR32; 14210 default: 14211 break; 14212 } 14213 llvm_unreachable("Unknown CMOV opcode!"); 14214} 14215 14216// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions. 14217// They will be translated into a spin-loop or compare-exchange loop from 14218// 14219// ... 14220// dst = atomic-fetch-op MI.addr, MI.val 14221// ... 14222// 14223// to 14224// 14225// ... 14226// t1 = LOAD MI.addr 14227// loop: 14228// t4 = phi(t1, t3 / loop) 14229// t2 = OP MI.val, t4 14230// EAX = t4 14231// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined] 14232// t3 = EAX 14233// JNE loop 14234// sink: 14235// dst = t3 14236// ... 
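//
// As a concrete illustration (with the virtual registers shown as physical
// ones for readability), an i32 'atomicrmw add' becomes a loop of roughly
// this shape:
//
//     movl (mem), %eax
//   loop:
//     movl %eax, %ecx
//     addl val, %ecx
//     lock cmpxchgl %ecx, (mem)
//     jne  loop
//
// CMPXCHG compares EAX with the memory operand; on success it stores ECX,
// and on failure it reloads the observed memory value into EAX so the next
// iteration retries with a fresh value.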
MachineBasicBlock *
X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
                                       MachineBasicBlock *MBB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = MBB;
  ++I;

  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
         "Unexpected number of operands");

  assert(MI->hasOneMemOperand() &&
         "Expected atomic-load-op to have one memoperand");

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  unsigned DstReg, SrcReg;
  unsigned MemOpndSlot;

  unsigned CurOp = 0;

  DstReg = MI->getOperand(CurOp++).getReg();
  MemOpndSlot = CurOp;
  CurOp += X86::AddrNumOperands;
  SrcReg = MI->getOperand(CurOp++).getReg();

  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  MVT::SimpleValueType VT = *RC->vt_begin();
  unsigned t1 = MRI.createVirtualRegister(RC);
  unsigned t2 = MRI.createVirtualRegister(RC);
  unsigned t3 = MRI.createVirtualRegister(RC);
  unsigned t4 = MRI.createVirtualRegister(RC);
  unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);

  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
  unsigned LOADOpc = getLoadOpcode(VT);

  // For the atomic load-arith operator, we generate
  //
  //  thisMBB:
  //    t1 = LOAD [MI.addr]
  //  mainMBB:
  //    t4 = phi(t1 / thisMBB, t3 / mainMBB)
  //    t2 = OP MI.val, t4
  //    EAX = t4
  //    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
  //    t3 = EAX
  //    JNE mainMBB
  //  sinkMBB:
  //    dst = t3

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
    if (NewMO.isReg())
      NewMO.setIsKill(false);
    MIB.addOperand(NewMO);
  }
  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
    unsigned flags = (*MMOI)->getFlags();
    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
    MachineMemOperand *MMO =
      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
                               (*MMOI)->getSize(),
                               (*MMOI)->getBaseAlignment(),
                               (*MMOI)->getTBAAInfo(),
                               (*MMOI)->getRanges());
    MIB.addMemOperand(MMO);
  }

  thisMBB->addSuccessor(mainMBB);

  // mainMBB:
  MachineBasicBlock *origMainMBB = mainMBB;

  // Add a PHI.
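  // t4 carries the value last observed in memory: t1 (from the initial
  // plain load) when entering from thisMBB, and t3 (whatever LCMPXCHG left
  // in EAX) on every retry from mainMBB.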
14334 MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4) 14335 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); 14336 14337 unsigned Opc = MI->getOpcode(); 14338 switch (Opc) { 14339 default: 14340 llvm_unreachable("Unhandled atomic-load-op opcode!"); 14341 case X86::ATOMAND8: 14342 case X86::ATOMAND16: 14343 case X86::ATOMAND32: 14344 case X86::ATOMAND64: 14345 case X86::ATOMOR8: 14346 case X86::ATOMOR16: 14347 case X86::ATOMOR32: 14348 case X86::ATOMOR64: 14349 case X86::ATOMXOR8: 14350 case X86::ATOMXOR16: 14351 case X86::ATOMXOR32: 14352 case X86::ATOMXOR64: { 14353 unsigned ARITHOpc = getNonAtomicOpcode(Opc); 14354 BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg) 14355 .addReg(t4); 14356 break; 14357 } 14358 case X86::ATOMNAND8: 14359 case X86::ATOMNAND16: 14360 case X86::ATOMNAND32: 14361 case X86::ATOMNAND64: { 14362 unsigned Tmp = MRI.createVirtualRegister(RC); 14363 unsigned NOTOpc; 14364 unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); 14365 BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg) 14366 .addReg(t4); 14367 BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp); 14368 break; 14369 } 14370 case X86::ATOMMAX8: 14371 case X86::ATOMMAX16: 14372 case X86::ATOMMAX32: 14373 case X86::ATOMMAX64: 14374 case X86::ATOMMIN8: 14375 case X86::ATOMMIN16: 14376 case X86::ATOMMIN32: 14377 case X86::ATOMMIN64: 14378 case X86::ATOMUMAX8: 14379 case X86::ATOMUMAX16: 14380 case X86::ATOMUMAX32: 14381 case X86::ATOMUMAX64: 14382 case X86::ATOMUMIN8: 14383 case X86::ATOMUMIN16: 14384 case X86::ATOMUMIN32: 14385 case X86::ATOMUMIN64: { 14386 unsigned CMPOpc; 14387 unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc); 14388 14389 BuildMI(mainMBB, DL, TII->get(CMPOpc)) 14390 .addReg(SrcReg) 14391 .addReg(t4); 14392 14393 if (Subtarget->hasCMov()) { 14394 if (VT != MVT::i8) { 14395 // Native support 14396 BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) 14397 .addReg(SrcReg) 14398 .addReg(t4); 14399 } else { 14400 // Promote i8 to i32 to use CMOV32 14401 const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); 14402 const TargetRegisterClass *RC32 = 14403 TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit); 14404 unsigned SrcReg32 = MRI.createVirtualRegister(RC32); 14405 unsigned AccReg32 = MRI.createVirtualRegister(RC32); 14406 unsigned Tmp = MRI.createVirtualRegister(RC32); 14407 14408 unsigned Undef = MRI.createVirtualRegister(RC32); 14409 BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); 14410 14411 BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32) 14412 .addReg(Undef) 14413 .addReg(SrcReg) 14414 .addImm(X86::sub_8bit); 14415 BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) 14416 .addReg(Undef) 14417 .addReg(t4) 14418 .addImm(X86::sub_8bit); 14419 14420 BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp) 14421 .addReg(SrcReg32) 14422 .addReg(AccReg32); 14423 14424 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2) 14425 .addReg(Tmp, 0, X86::sub_8bit); 14426 } 14427 } else { 14428 // Use pseudo select and lower them. 
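      // Without native CMOV, EmitLoweredSelect expands the CMOV_GR* pseudo
      // into a branch diamond, splitting mainMBB; the PHI built above must
      // then be recreated against the new block layout (see below).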
14429 assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) && 14430 "Invalid atomic-load-op transformation!"); 14431 unsigned SelOpc = getPseudoCMOVOpc(VT); 14432 X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); 14433 assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); 14434 MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2) 14435 .addReg(SrcReg).addReg(t4) 14436 .addImm(CC); 14437 mainMBB = EmitLoweredSelect(MIB, mainMBB); 14438 // Replace the original PHI node as mainMBB is changed after CMOV 14439 // lowering. 14440 BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4) 14441 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB); 14442 Phi->eraseFromParent(); 14443 } 14444 break; 14445 } 14446 } 14447 14448 // Copy PhyReg back from virtual register. 14449 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg) 14450 .addReg(t4); 14451 14452 MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); 14453 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 14454 MachineOperand NewMO = MI->getOperand(MemOpndSlot + i); 14455 if (NewMO.isReg()) 14456 NewMO.setIsKill(false); 14457 MIB.addOperand(NewMO); 14458 } 14459 MIB.addReg(t2); 14460 MIB.setMemRefs(MMOBegin, MMOEnd); 14461 14462 // Copy PhyReg back to virtual register. 14463 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3) 14464 .addReg(PhyReg); 14465 14466 BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); 14467 14468 mainMBB->addSuccessor(origMainMBB); 14469 mainMBB->addSuccessor(sinkMBB); 14470 14471 // sinkMBB: 14472 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 14473 TII->get(TargetOpcode::COPY), DstReg) 14474 .addReg(t3); 14475 14476 MI->eraseFromParent(); 14477 return sinkMBB; 14478} 14479 14480// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic 14481// instructions. They will be translated into a spin-loop or compare-exchange 14482// loop from 14483// 14484// ... 14485// dst = atomic-fetch-op MI.addr, MI.val 14486// ... 14487// 14488// to 14489// 14490// ... 14491// t1L = LOAD [MI.addr + 0] 14492// t1H = LOAD [MI.addr + 4] 14493// loop: 14494// t4L = phi(t1L, t3L / loop) 14495// t4H = phi(t1H, t3H / loop) 14496// t2L = OP MI.val.lo, t4L 14497// t2H = OP MI.val.hi, t4H 14498// EAX = t4L 14499// EDX = t4H 14500// EBX = t2L 14501// ECX = t2H 14502// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] 14503// t3L = EAX 14504// t3H = EDX 14505// JNE loop 14506// sink: 14507// dstL = t3L 14508// dstH = t3H 14509// ... 
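//
// Concretely, an i64 'atomicrmw add' on a 32-bit target becomes a loop of
// roughly this shape (the register assignment is dictated by CMPXCHG8B):
//
//     movl (mem), %eax
//     movl 4(mem), %edx
//   loop:
//     movl %eax, %ebx
//     movl %edx, %ecx
//     addl val.lo, %ebx
//     adcl val.hi, %ecx
//     lock cmpxchg8b (mem)
//     jne  loop
//
// CMPXCHG8B compares EDX:EAX against the 8-byte memory operand, stores
// ECX:EBX on success, and reloads EDX:EAX with the observed value on
// failure.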
14510MachineBasicBlock * 14511X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, 14512 MachineBasicBlock *MBB) const { 14513 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 14514 DebugLoc DL = MI->getDebugLoc(); 14515 14516 MachineFunction *MF = MBB->getParent(); 14517 MachineRegisterInfo &MRI = MF->getRegInfo(); 14518 14519 const BasicBlock *BB = MBB->getBasicBlock(); 14520 MachineFunction::iterator I = MBB; 14521 ++I; 14522 14523 assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 && 14524 "Unexpected number of operands"); 14525 14526 assert(MI->hasOneMemOperand() && 14527 "Expected atomic-load-op32 to have one memoperand"); 14528 14529 // Memory Reference 14530 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 14531 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 14532 14533 unsigned DstLoReg, DstHiReg; 14534 unsigned SrcLoReg, SrcHiReg; 14535 unsigned MemOpndSlot; 14536 14537 unsigned CurOp = 0; 14538 14539 DstLoReg = MI->getOperand(CurOp++).getReg(); 14540 DstHiReg = MI->getOperand(CurOp++).getReg(); 14541 MemOpndSlot = CurOp; 14542 CurOp += X86::AddrNumOperands; 14543 SrcLoReg = MI->getOperand(CurOp++).getReg(); 14544 SrcHiReg = MI->getOperand(CurOp++).getReg(); 14545 14546 const TargetRegisterClass *RC = &X86::GR32RegClass; 14547 const TargetRegisterClass *RC8 = &X86::GR8RegClass; 14548 14549 unsigned t1L = MRI.createVirtualRegister(RC); 14550 unsigned t1H = MRI.createVirtualRegister(RC); 14551 unsigned t2L = MRI.createVirtualRegister(RC); 14552 unsigned t2H = MRI.createVirtualRegister(RC); 14553 unsigned t3L = MRI.createVirtualRegister(RC); 14554 unsigned t3H = MRI.createVirtualRegister(RC); 14555 unsigned t4L = MRI.createVirtualRegister(RC); 14556 unsigned t4H = MRI.createVirtualRegister(RC); 14557 14558 unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; 14559 unsigned LOADOpc = X86::MOV32rm; 14560 14561 // For the atomic load-arith operator, we generate 14562 // 14563 // thisMBB: 14564 // t1L = LOAD [MI.addr + 0] 14565 // t1H = LOAD [MI.addr + 4] 14566 // mainMBB: 14567 // t4L = phi(t1L / thisMBB, t3L / mainMBB) 14568 // t4H = phi(t1H / thisMBB, t3H / mainMBB) 14569 // t2L = OP MI.val.lo, t4L 14570 // t2H = OP MI.val.hi, t4H 14571 // EBX = t2L 14572 // ECX = t2H 14573 // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] 14574 // t3L = EAX 14575 // t3H = EDX 14576 // JNE loop 14577 // sinkMBB: 14578 // dstL = t3L 14579 // dstH = t3H 14580 14581 MachineBasicBlock *thisMBB = MBB; 14582 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 14583 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 14584 MF->insert(I, mainMBB); 14585 MF->insert(I, sinkMBB); 14586 14587 MachineInstrBuilder MIB; 14588 14589 // Transfer the remainder of BB and its successor edges to sinkMBB. 
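  // (splice moves every instruction after MI into sinkMBB, and
  // transferSuccessorsAndUpdatePHIs rewires MBB's successor edges, plus any
  // PHIs in those successors, to reference sinkMBB instead.)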
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  // Lo
  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
    if (NewMO.isReg())
      NewMO.setIsKill(false);
    MIB.addOperand(NewMO);
  }
  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
    unsigned flags = (*MMOI)->getFlags();
    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
    MachineMemOperand *MMO =
      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
                               (*MMOI)->getSize(),
                               (*MMOI)->getBaseAlignment(),
                               (*MMOI)->getTBAAInfo(),
                               (*MMOI)->getRanges());
    MIB.addMemOperand(MMO);
  }
  MachineInstr *LowMI = MIB;

  // Hi
  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp) {
      MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
    } else {
      MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
      if (NewMO.isReg())
        NewMO.setIsKill(false);
      MIB.addOperand(NewMO);
    }
  }
  MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());

  thisMBB->addSuccessor(mainMBB);

  // mainMBB:
  MachineBasicBlock *origMainMBB = mainMBB;

  // Add PHIs.
  MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
                         .addReg(t1L).addMBB(thisMBB)
                         .addReg(t3L).addMBB(mainMBB);
  MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
                         .addReg(t1H).addMBB(thisMBB)
                         .addReg(t3H).addMBB(mainMBB);

  unsigned Opc = MI->getOpcode();
  switch (Opc) {
  default:
    llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
  case X86::ATOMAND6432:
  case X86::ATOMOR6432:
  case X86::ATOMXOR6432:
  case X86::ATOMADD6432:
  case X86::ATOMSUB6432: {
    unsigned HiOpc;
    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
      .addReg(SrcLoReg);
    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
      .addReg(SrcHiReg);
    break;
  }
  case X86::ATOMNAND6432: {
    unsigned HiOpc, NOTOpc;
    unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
    unsigned TmpL = MRI.createVirtualRegister(RC);
    unsigned TmpH = MRI.createVirtualRegister(RC);
    BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
      .addReg(t4L);
    BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
      .addReg(t4H);
    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
    break;
  }
  case X86::ATOMMAX6432:
  case X86::ATOMMIN6432:
  case X86::ATOMUMAX6432:
  case X86::ATOMUMIN6432: {
    unsigned HiOpc;
    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
    unsigned cL = MRI.createVirtualRegister(RC8);
    unsigned cH = MRI.createVirtualRegister(RC8);
    unsigned cL32 = MRI.createVirtualRegister(RC);
    unsigned cH32 = MRI.createVirtualRegister(RC);
    unsigned cc = MRI.createVirtualRegister(RC);
    // cl := cmp src_lo, lo
    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
      .addReg(SrcLoReg).addReg(t4L);
    BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
    // ch := cmp src_hi, hi
    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
      .addReg(SrcHiReg).addReg(t4H);
    BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
    // cc := if (src_hi == hi) ? cl : ch;
    if (Subtarget->hasCMov()) {
      BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
        .addReg(cH32).addReg(cL32);
    } else {
      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
              .addReg(cH32).addReg(cL32)
              .addImm(X86::COND_E);
      mainMBB = EmitLoweredSelect(MIB, mainMBB);
    }
    BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
    if (Subtarget->hasCMov()) {
      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
        .addReg(SrcLoReg).addReg(t4L);
      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
        .addReg(SrcHiReg).addReg(t4H);
    } else {
      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
              .addReg(SrcLoReg).addReg(t4L)
              .addImm(X86::COND_NE);
      mainMBB = EmitLoweredSelect(MIB, mainMBB);
      // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
      // 2nd CMOV lowering.
      mainMBB->addLiveIn(X86::EFLAGS);
      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
              .addReg(SrcHiReg).addReg(t4H)
              .addImm(X86::COND_NE);
      mainMBB = EmitLoweredSelect(MIB, mainMBB);
      // Replace the original PHI node as mainMBB is changed after CMOV
      // lowering.
      BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
      BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
      PhiL->eraseFromParent();
      PhiH->eraseFromParent();
    }
    break;
  }
  case X86::ATOMSWAP6432: {
    unsigned HiOpc;
    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
    break;
  }
  }

  // Load the expected value (t4H:t4L) into EDX:EAX.
  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
  // Load the new value (t2H:t2L) into ECX:EBX.
  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);

  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
    if (NewMO.isReg())
      NewMO.setIsKill(false);
    MIB.addOperand(NewMO);
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Copy EDX:EAX back to t3H:t3L
  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);

  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);

  mainMBB->addSuccessor(origMainMBB);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(TargetOpcode::COPY), DstLoReg)
    .addReg(t3L);
  BuildMI(*sinkMBB,
sinkMBB->begin(), DL, 14770 TII->get(TargetOpcode::COPY), DstHiReg) 14771 .addReg(t3H); 14772 14773 MI->eraseFromParent(); 14774 return sinkMBB; 14775} 14776 14777// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 14778// or XMM0_V32I8 in AVX all of this code can be replaced with that 14779// in the .td file. 14780static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, 14781 const TargetInstrInfo *TII) { 14782 unsigned Opc; 14783 switch (MI->getOpcode()) { 14784 default: llvm_unreachable("illegal opcode!"); 14785 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; 14786 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; 14787 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; 14788 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; 14789 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; 14790 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; 14791 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; 14792 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; 14793 } 14794 14795 DebugLoc dl = MI->getDebugLoc(); 14796 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 14797 14798 unsigned NumArgs = MI->getNumOperands(); 14799 for (unsigned i = 1; i < NumArgs; ++i) { 14800 MachineOperand &Op = MI->getOperand(i); 14801 if (!(Op.isReg() && Op.isImplicit())) 14802 MIB.addOperand(Op); 14803 } 14804 if (MI->hasOneMemOperand()) 14805 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 14806 14807 BuildMI(*BB, MI, dl, 14808 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 14809 .addReg(X86::XMM0); 14810 14811 MI->eraseFromParent(); 14812 return BB; 14813} 14814 14815// FIXME: Custom handling because TableGen doesn't support multiple implicit 14816// defs in an instruction pattern 14817static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, 14818 const TargetInstrInfo *TII) { 14819 unsigned Opc; 14820 switch (MI->getOpcode()) { 14821 default: llvm_unreachable("illegal opcode!"); 14822 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; 14823 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; 14824 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; 14825 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; 14826 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; 14827 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; 14828 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; 14829 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; 14830 } 14831 14832 DebugLoc dl = MI->getDebugLoc(); 14833 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 14834 14835 unsigned NumArgs = MI->getNumOperands(); // remove the results 14836 for (unsigned i = 1; i < NumArgs; ++i) { 14837 MachineOperand &Op = MI->getOperand(i); 14838 if (!(Op.isReg() && Op.isImplicit())) 14839 MIB.addOperand(Op); 14840 } 14841 if (MI->hasOneMemOperand()) 14842 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 14843 14844 BuildMI(*BB, MI, dl, 14845 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 14846 .addReg(X86::ECX); 14847 14848 MI->eraseFromParent(); 14849 return BB; 14850} 14851 14852static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, 14853 const TargetInstrInfo *TII, 14854 const X86Subtarget* Subtarget) { 14855 DebugLoc dl = MI->getDebugLoc(); 14856 14857 // Address into RAX/EAX, other two args into ECX, EDX. 14858 unsigned MemOpc = Subtarget->is64Bit() ? 
X86::LEA64r : X86::LEA32r; 14859 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 14860 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 14861 for (int i = 0; i < X86::AddrNumOperands; ++i) 14862 MIB.addOperand(MI->getOperand(i)); 14863 14864 unsigned ValOps = X86::AddrNumOperands; 14865 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 14866 .addReg(MI->getOperand(ValOps).getReg()); 14867 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 14868 .addReg(MI->getOperand(ValOps+1).getReg()); 14869 14870 // The instruction doesn't actually take any operands though. 14871 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 14872 14873 MI->eraseFromParent(); // The pseudo is gone now. 14874 return BB; 14875} 14876 14877MachineBasicBlock * 14878X86TargetLowering::EmitVAARG64WithCustomInserter( 14879 MachineInstr *MI, 14880 MachineBasicBlock *MBB) const { 14881 // Emit va_arg instruction on X86-64. 14882 14883 // Operands to this pseudo-instruction: 14884 // 0 ) Output : destination address (reg) 14885 // 1-5) Input : va_list address (addr, i64mem) 14886 // 6 ) ArgSize : Size (in bytes) of vararg type 14887 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 14888 // 8 ) Align : Alignment of type 14889 // 9 ) EFLAGS (implicit-def) 14890 14891 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 14892 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 14893 14894 unsigned DestReg = MI->getOperand(0).getReg(); 14895 MachineOperand &Base = MI->getOperand(1); 14896 MachineOperand &Scale = MI->getOperand(2); 14897 MachineOperand &Index = MI->getOperand(3); 14898 MachineOperand &Disp = MI->getOperand(4); 14899 MachineOperand &Segment = MI->getOperand(5); 14900 unsigned ArgSize = MI->getOperand(6).getImm(); 14901 unsigned ArgMode = MI->getOperand(7).getImm(); 14902 unsigned Align = MI->getOperand(8).getImm(); 14903 14904 // Memory Reference 14905 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 14906 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 14907 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 14908 14909 // Machine Information 14910 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 14911 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 14912 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 14913 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 14914 DebugLoc DL = MI->getDebugLoc(); 14915 14916 // struct va_list { 14917 // i32 gp_offset 14918 // i32 fp_offset 14919 // i64 overflow_area (address) 14920 // i64 reg_save_area (address) 14921 // } 14922 // sizeof(va_list) = 24 14923 // alignment(va_list) = 8 14924 14925 unsigned TotalNumIntRegs = 6; 14926 unsigned TotalNumXMMRegs = 8; 14927 bool UseGPOffset = (ArgMode == 1); 14928 bool UseFPOffset = (ArgMode == 2); 14929 unsigned MaxOffset = TotalNumIntRegs * 8 + 14930 (UseFPOffset ? 
TotalNumXMMRegs * 16 : 0); 14931 14932 /* Align ArgSize to a multiple of 8 */ 14933 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 14934 bool NeedsAlign = (Align > 8); 14935 14936 MachineBasicBlock *thisMBB = MBB; 14937 MachineBasicBlock *overflowMBB; 14938 MachineBasicBlock *offsetMBB; 14939 MachineBasicBlock *endMBB; 14940 14941 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 14942 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 14943 unsigned OffsetReg = 0; 14944 14945 if (!UseGPOffset && !UseFPOffset) { 14946 // If we only pull from the overflow region, we don't create a branch. 14947 // We don't need to alter control flow. 14948 OffsetDestReg = 0; // unused 14949 OverflowDestReg = DestReg; 14950 14951 offsetMBB = NULL; 14952 overflowMBB = thisMBB; 14953 endMBB = thisMBB; 14954 } else { 14955 // First emit code to check if gp_offset (or fp_offset) is below the bound. 14956 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 14957 // If not, pull from overflow_area. (branch to overflowMBB) 14958 // 14959 // thisMBB 14960 // | . 14961 // | . 14962 // offsetMBB overflowMBB 14963 // | . 14964 // | . 14965 // endMBB 14966 14967 // Registers for the PHI in endMBB 14968 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 14969 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 14970 14971 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 14972 MachineFunction *MF = MBB->getParent(); 14973 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 14974 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 14975 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 14976 14977 MachineFunction::iterator MBBIter = MBB; 14978 ++MBBIter; 14979 14980 // Insert the new basic blocks 14981 MF->insert(MBBIter, offsetMBB); 14982 MF->insert(MBBIter, overflowMBB); 14983 MF->insert(MBBIter, endMBB); 14984 14985 // Transfer the remainder of MBB and its successor edges to endMBB. 14986 endMBB->splice(endMBB->begin(), thisMBB, 14987 llvm::next(MachineBasicBlock::iterator(MI)), 14988 thisMBB->end()); 14989 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 14990 14991 // Make offsetMBB and overflowMBB successors of thisMBB 14992 thisMBB->addSuccessor(offsetMBB); 14993 thisMBB->addSuccessor(overflowMBB); 14994 14995 // endMBB is a successor of both offsetMBB and overflowMBB 14996 offsetMBB->addSuccessor(endMBB); 14997 overflowMBB->addSuccessor(endMBB); 14998 14999 // Load the offset value into a register 15000 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 15001 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 15002 .addOperand(Base) 15003 .addOperand(Scale) 15004 .addOperand(Index) 15005 .addDisp(Disp, UseFPOffset ? 4 : 0) 15006 .addOperand(Segment) 15007 .setMemRefs(MMOBegin, MMOEnd); 15008 15009 // Check if there is enough room left to pull this argument. 15010 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 15011 .addReg(OffsetReg) 15012 .addImm(MaxOffset + 8 - ArgSizeA8); 15013 15014 // Branch to "overflowMBB" if offset >= max 15015 // Fall through to "offsetMBB" otherwise 15016 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 15017 .addMBB(overflowMBB); 15018 } 15019 15020 // In offsetMBB, emit code to use the reg_save_area. 15021 if (offsetMBB) { 15022 assert(OffsetReg != 0); 15023 15024 // Read the reg_save_area address. 
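    // reg_save_area lives at byte offset 16 of the va_list, after gp_offset
    // (offset 0), fp_offset (offset 4) and the overflow_area pointer
    // (offset 8); hence the displacement of 16 on the load below.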
15025 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 15026 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 15027 .addOperand(Base) 15028 .addOperand(Scale) 15029 .addOperand(Index) 15030 .addDisp(Disp, 16) 15031 .addOperand(Segment) 15032 .setMemRefs(MMOBegin, MMOEnd); 15033 15034 // Zero-extend the offset 15035 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 15036 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 15037 .addImm(0) 15038 .addReg(OffsetReg) 15039 .addImm(X86::sub_32bit); 15040 15041 // Add the offset to the reg_save_area to get the final address. 15042 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 15043 .addReg(OffsetReg64) 15044 .addReg(RegSaveReg); 15045 15046 // Compute the offset for the next argument 15047 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 15048 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 15049 .addReg(OffsetReg) 15050 .addImm(UseFPOffset ? 16 : 8); 15051 15052 // Store it back into the va_list. 15053 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 15054 .addOperand(Base) 15055 .addOperand(Scale) 15056 .addOperand(Index) 15057 .addDisp(Disp, UseFPOffset ? 4 : 0) 15058 .addOperand(Segment) 15059 .addReg(NextOffsetReg) 15060 .setMemRefs(MMOBegin, MMOEnd); 15061 15062 // Jump to endMBB 15063 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 15064 .addMBB(endMBB); 15065 } 15066 15067 // 15068 // Emit code to use overflow area 15069 // 15070 15071 // Load the overflow_area address into a register. 15072 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 15073 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 15074 .addOperand(Base) 15075 .addOperand(Scale) 15076 .addOperand(Index) 15077 .addDisp(Disp, 8) 15078 .addOperand(Segment) 15079 .setMemRefs(MMOBegin, MMOEnd); 15080 15081 // If we need to align it, do so. Otherwise, just copy the address 15082 // to OverflowDestReg. 15083 if (NeedsAlign) { 15084 // Align the overflow address 15085 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 15086 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 15087 15088 // aligned_addr = (addr + (align-1)) & ~(align-1) 15089 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 15090 .addReg(OverflowAddrReg) 15091 .addImm(Align-1); 15092 15093 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 15094 .addReg(TmpReg) 15095 .addImm(~(uint64_t)(Align-1)); 15096 } else { 15097 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 15098 .addReg(OverflowAddrReg); 15099 } 15100 15101 // Compute the next overflow address after this argument. 15102 // (the overflow address should be kept 8-byte aligned) 15103 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 15104 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 15105 .addReg(OverflowDestReg) 15106 .addImm(ArgSizeA8); 15107 15108 // Store the new overflow address. 15109 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 15110 .addOperand(Base) 15111 .addOperand(Scale) 15112 .addOperand(Index) 15113 .addDisp(Disp, 8) 15114 .addOperand(Segment) 15115 .addReg(NextAddrReg) 15116 .setMemRefs(MMOBegin, MMOEnd); 15117 15118 // If we branched, emit the PHI to the front of endMBB. 
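  // The PHI merges the two candidate argument addresses: OffsetDestReg when
  // the argument was pulled from the reg_save_area, and OverflowDestReg when
  // it was read from the overflow area.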
15119 if (offsetMBB) { 15120 BuildMI(*endMBB, endMBB->begin(), DL, 15121 TII->get(X86::PHI), DestReg) 15122 .addReg(OffsetDestReg).addMBB(offsetMBB) 15123 .addReg(OverflowDestReg).addMBB(overflowMBB); 15124 } 15125 15126 // Erase the pseudo instruction 15127 MI->eraseFromParent(); 15128 15129 return endMBB; 15130} 15131 15132MachineBasicBlock * 15133X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 15134 MachineInstr *MI, 15135 MachineBasicBlock *MBB) const { 15136 // Emit code to save XMM registers to the stack. The ABI says that the 15137 // number of registers to save is given in %al, so it's theoretically 15138 // possible to do an indirect jump trick to avoid saving all of them, 15139 // however this code takes a simpler approach and just executes all 15140 // of the stores if %al is non-zero. It's less code, and it's probably 15141 // easier on the hardware branch predictor, and stores aren't all that 15142 // expensive anyway. 15143 15144 // Create the new basic blocks. One block contains all the XMM stores, 15145 // and one block is the final destination regardless of whether any 15146 // stores were performed. 15147 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 15148 MachineFunction *F = MBB->getParent(); 15149 MachineFunction::iterator MBBIter = MBB; 15150 ++MBBIter; 15151 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 15152 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 15153 F->insert(MBBIter, XMMSaveMBB); 15154 F->insert(MBBIter, EndMBB); 15155 15156 // Transfer the remainder of MBB and its successor edges to EndMBB. 15157 EndMBB->splice(EndMBB->begin(), MBB, 15158 llvm::next(MachineBasicBlock::iterator(MI)), 15159 MBB->end()); 15160 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 15161 15162 // The original block will now fall through to the XMM save block. 15163 MBB->addSuccessor(XMMSaveMBB); 15164 // The XMMSaveMBB will fall through to the end block. 15165 XMMSaveMBB->addSuccessor(EndMBB); 15166 15167 // Now add the instructions. 15168 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15169 DebugLoc DL = MI->getDebugLoc(); 15170 15171 unsigned CountReg = MI->getOperand(0).getReg(); 15172 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 15173 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 15174 15175 if (!Subtarget->isTargetWin64()) { 15176 // If %al is 0, branch around the XMM save block. 15177 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 15178 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 15179 MBB->addSuccessor(EndMBB); 15180 } 15181 15182 unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; 15183 // In the XMM save block, save all the XMM argument registers. 15184 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 15185 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 15186 MachineMemOperand *MMO = 15187 F->getMachineMemOperand( 15188 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 15189 MachineMemOperand::MOStore, 15190 /*Size=*/16, /*Align=*/16); 15191 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) 15192 .addFrameIndex(RegSaveFrameIndex) 15193 .addImm(/*Scale=*/1) 15194 .addReg(/*IndexReg=*/0) 15195 .addImm(/*Disp=*/Offset) 15196 .addReg(/*Segment=*/0) 15197 .addReg(MI->getOperand(i).getReg()) 15198 .addMemOperand(MMO); 15199 } 15200 15201 MI->eraseFromParent(); // The pseudo instruction is gone now. 
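// Schematically, the code built above is (registers and labels
// illustrative only):
//   testb %al, %al
//   je    EndMBB              ; test+branch omitted on Win64
// XMMSaveMBB:
//   movaps %xmm0, (FrameIdx + VarArgsFPOffset +  0)
//   movaps %xmm1, (FrameIdx + VarArgsFPOffset + 16)
//   ...
// EndMBB: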
15202
15203  return EndMBB;
15204}
15205
15206// The EFLAGS operand of SelectItr might be missing a kill marker
15207// because there were multiple uses of EFLAGS, and ISel didn't know
15208// which to mark. Figure out whether SelectItr should have had a
15209// kill marker, and set it if it should. Returns the correct kill
15210// marker value.
15211static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
15212                                     MachineBasicBlock* BB,
15213                                     const TargetRegisterInfo* TRI) {
15214  // Scan forward through BB for a use/def of EFLAGS.
15215  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
15216  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
15217    const MachineInstr& mi = *miI;
15218    if (mi.readsRegister(X86::EFLAGS))
15219      return false;
15220    if (mi.definesRegister(X86::EFLAGS))
15221      break; // Should have kill-flag - update below.
15222  }
15223
15224  // If we hit the end of the block, check whether EFLAGS is live into a
15225  // successor.
15226  if (miI == BB->end()) {
15227    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
15228                                          sEnd = BB->succ_end();
15229         sItr != sEnd; ++sItr) {
15230      MachineBasicBlock* succ = *sItr;
15231      if (succ->isLiveIn(X86::EFLAGS))
15232        return false;
15233    }
15234  }
15235
15236  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
15237  // out. SelectMI should have a kill flag on EFLAGS.
15238  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
15239  return true;
15240}
15241
15242MachineBasicBlock *
15243X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
15244                                     MachineBasicBlock *BB) const {
15245  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
15246  DebugLoc DL = MI->getDebugLoc();
15247
15248  // To "insert" a SELECT_CC instruction, we actually have to insert the
15249  // diamond control-flow pattern. The incoming instruction knows the
15250  // destination vreg to set, the condition code register to branch on, the
15251  // true/false values to select between, and a branch opcode to use.
15252  const BasicBlock *LLVM_BB = BB->getBasicBlock();
15253  MachineFunction::iterator It = BB;
15254  ++It;
15255
15256  //  thisMBB:
15257  //  ...
15258  //   TrueVal = ...
15259  //   cmpTY ccX, r1, r2
15260  //   bCC sinkMBB
15261  //   fallthrough --> copy0MBB
15262  MachineBasicBlock *thisMBB = BB;
15263  MachineFunction *F = BB->getParent();
15264  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
15265  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
15266  F->insert(It, copy0MBB);
15267  F->insert(It, sinkMBB);
15268
15269  // If the EFLAGS register isn't dead in the terminator, then claim that it's
15270  // live into the sink and copy blocks.
15271  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
15272  if (!MI->killsRegister(X86::EFLAGS) &&
15273      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
15274    copy0MBB->addLiveIn(X86::EFLAGS);
15275    sinkMBB->addLiveIn(X86::EFLAGS);
15276  }
15277
15278  // Transfer the remainder of BB and its successor edges to sinkMBB.
15279  sinkMBB->splice(sinkMBB->begin(), BB,
15280                  llvm::next(MachineBasicBlock::iterator(MI)),
15281                  BB->end());
15282  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
15283
15284  // Add the true and fallthrough blocks as its successors.
15285  BB->addSuccessor(copy0MBB);
15286  BB->addSuccessor(sinkMBB);
15287
15288  // Create the conditional branch instruction.
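// The CMOV pseudo carries its X86 condition code as immediate operand 3;
// map it to the corresponding conditional-branch opcode.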
15289 unsigned Opc = 15290 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 15291 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 15292 15293 // copy0MBB: 15294 // %FalseValue = ... 15295 // # fallthrough to sinkMBB 15296 copy0MBB->addSuccessor(sinkMBB); 15297 15298 // sinkMBB: 15299 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 15300 // ... 15301 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 15302 TII->get(X86::PHI), MI->getOperand(0).getReg()) 15303 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 15304 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 15305 15306 MI->eraseFromParent(); // The pseudo instruction is gone now. 15307 return sinkMBB; 15308} 15309 15310MachineBasicBlock * 15311X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, 15312 bool Is64Bit) const { 15313 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15314 DebugLoc DL = MI->getDebugLoc(); 15315 MachineFunction *MF = BB->getParent(); 15316 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 15317 15318 assert(getTargetMachine().Options.EnableSegmentedStacks); 15319 15320 unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; 15321 unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; 15322 15323 // BB: 15324 // ... [Till the alloca] 15325 // If stacklet is not large enough, jump to mallocMBB 15326 // 15327 // bumpMBB: 15328 // Allocate by subtracting from RSP 15329 // Jump to continueMBB 15330 // 15331 // mallocMBB: 15332 // Allocate by call to runtime 15333 // 15334 // continueMBB: 15335 // ... 15336 // [rest of original BB] 15337 // 15338 15339 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); 15340 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); 15341 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); 15342 15343 MachineRegisterInfo &MRI = MF->getRegInfo(); 15344 const TargetRegisterClass *AddrRegClass = 15345 getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); 15346 15347 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), 15348 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), 15349 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), 15350 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), 15351 sizeVReg = MI->getOperand(1).getReg(), 15352 physSPReg = Is64Bit ? X86::RSP : X86::ESP; 15353 15354 MachineFunction::iterator MBBIter = BB; 15355 ++MBBIter; 15356 15357 MF->insert(MBBIter, bumpMBB); 15358 MF->insert(MBBIter, mallocMBB); 15359 MF->insert(MBBIter, continueMBB); 15360 15361 continueMBB->splice(continueMBB->begin(), BB, llvm::next 15362 (MachineBasicBlock::iterator(MI)), BB->end()); 15363 continueMBB->transferSuccessorsAndUpdatePHIs(BB); 15364 15365 // Add code to the main basic block to check if the stack limit has been hit, 15366 // and if so, jump to mallocMBB otherwise to bumpMBB. 15367 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); 15368 BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) 15369 .addReg(tmpSPVReg).addReg(sizeVReg); 15370 BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) 15371 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) 15372 .addReg(SPLimitVReg); 15373 BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); 15374 15375 // bumpMBB simply decreases the stack pointer, since we know the current 15376 // stacklet has enough space. 
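// (SPLimitVReg already holds SP - size from the SUB above, so "bumping"
// is just committing that value to the physical stack pointer.)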
15377 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) 15378 .addReg(SPLimitVReg); 15379 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) 15380 .addReg(SPLimitVReg); 15381 BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 15382 15383 // Calls into a routine in libgcc to allocate more space from the heap. 15384 const uint32_t *RegMask = 15385 getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); 15386 if (Is64Bit) { 15387 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) 15388 .addReg(sizeVReg); 15389 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 15390 .addExternalSymbol("__morestack_allocate_stack_space") 15391 .addRegMask(RegMask) 15392 .addReg(X86::RDI, RegState::Implicit) 15393 .addReg(X86::RAX, RegState::ImplicitDefine); 15394 } else { 15395 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) 15396 .addImm(12); 15397 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); 15398 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) 15399 .addExternalSymbol("__morestack_allocate_stack_space") 15400 .addRegMask(RegMask) 15401 .addReg(X86::EAX, RegState::ImplicitDefine); 15402 } 15403 15404 if (!Is64Bit) 15405 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) 15406 .addImm(16); 15407 15408 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) 15409 .addReg(Is64Bit ? X86::RAX : X86::EAX); 15410 BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 15411 15412 // Set up the CFG correctly. 15413 BB->addSuccessor(bumpMBB); 15414 BB->addSuccessor(mallocMBB); 15415 mallocMBB->addSuccessor(continueMBB); 15416 bumpMBB->addSuccessor(continueMBB); 15417 15418 // Take care of the PHI nodes. 15419 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), 15420 MI->getOperand(0).getReg()) 15421 .addReg(mallocPtrVReg).addMBB(mallocMBB) 15422 .addReg(bumpSPPtrVReg).addMBB(bumpMBB); 15423 15424 // Delete the original pseudo instruction. 15425 MI->eraseFromParent(); 15426 15427 // And we're done. 15428 return continueMBB; 15429} 15430 15431MachineBasicBlock * 15432X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 15433 MachineBasicBlock *BB) const { 15434 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15435 DebugLoc DL = MI->getDebugLoc(); 15436 15437 assert(!Subtarget->isTargetEnvMacho()); 15438 15439 // The lowering is pretty easy: we're just emitting the call to _alloca. The 15440 // non-trivial part is impdef of ESP. 15441 15442 if (Subtarget->isTargetWin64()) { 15443 if (Subtarget->isTargetCygMing()) { 15444 // ___chkstk(Mingw64): 15445 // Clobbers R10, R11, RAX and EFLAGS. 15446 // Updates RSP. 15447 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 15448 .addExternalSymbol("___chkstk") 15449 .addReg(X86::RAX, RegState::Implicit) 15450 .addReg(X86::RSP, RegState::Implicit) 15451 .addReg(X86::RAX, RegState::Define | RegState::Implicit) 15452 .addReg(X86::RSP, RegState::Define | RegState::Implicit) 15453 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 15454 } else { 15455 // __chkstk(MSVCRT): does not update stack pointer. 15456 // Clobbers R10, R11 and EFLAGS. 15457 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 15458 .addExternalSymbol("__chkstk") 15459 .addReg(X86::RAX, RegState::Implicit) 15460 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 15461 // RAX has the offset to be subtracted from RSP. 
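// (Unlike the MinGW variant above, MSVC's __chkstk only probes the
// touched pages, so the actual RSP adjustment is emitted explicitly:)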
15462 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) 15463 .addReg(X86::RSP) 15464 .addReg(X86::RAX); 15465 } 15466 } else { 15467 const char *StackProbeSymbol = 15468 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 15469 15470 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 15471 .addExternalSymbol(StackProbeSymbol) 15472 .addReg(X86::EAX, RegState::Implicit) 15473 .addReg(X86::ESP, RegState::Implicit) 15474 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 15475 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 15476 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 15477 } 15478 15479 MI->eraseFromParent(); // The pseudo instruction is gone now. 15480 return BB; 15481} 15482 15483MachineBasicBlock * 15484X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 15485 MachineBasicBlock *BB) const { 15486 // This is pretty easy. We're taking the value that we received from 15487 // our load from the relocation, sticking it in either RDI (x86-64) 15488 // or EAX and doing an indirect call. The return value will then 15489 // be in the normal return register. 15490 const X86InstrInfo *TII 15491 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 15492 DebugLoc DL = MI->getDebugLoc(); 15493 MachineFunction *F = BB->getParent(); 15494 15495 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 15496 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 15497 15498 // Get a register mask for the lowered call. 15499 // FIXME: The 32-bit calls have non-standard calling conventions. Use a 15500 // proper register mask. 15501 const uint32_t *RegMask = 15502 getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); 15503 if (Subtarget->is64Bit()) { 15504 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 15505 TII->get(X86::MOV64rm), X86::RDI) 15506 .addReg(X86::RIP) 15507 .addImm(0).addReg(0) 15508 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 15509 MI->getOperand(3).getTargetFlags()) 15510 .addReg(0); 15511 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 15512 addDirectMem(MIB, X86::RDI); 15513 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); 15514 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 15515 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 15516 TII->get(X86::MOV32rm), X86::EAX) 15517 .addReg(0) 15518 .addImm(0).addReg(0) 15519 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 15520 MI->getOperand(3).getTargetFlags()) 15521 .addReg(0); 15522 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 15523 addDirectMem(MIB, X86::EAX); 15524 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 15525 } else { 15526 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 15527 TII->get(X86::MOV32rm), X86::EAX) 15528 .addReg(TII->getGlobalBaseReg(F)) 15529 .addImm(0).addReg(0) 15530 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 15531 MI->getOperand(3).getTargetFlags()) 15532 .addReg(0); 15533 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 15534 addDirectMem(MIB, X86::EAX); 15535 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 15536 } 15537 15538 MI->eraseFromParent(); // The pseudo instruction is gone now. 
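// For the 64-bit case the sequence built above is, schematically:
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)
// with the variable's address then in the normal return register (the
// TLVP-style reference comes from the operand's target flags).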
15539 return BB; 15540} 15541 15542MachineBasicBlock * 15543X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, 15544 MachineBasicBlock *MBB) const { 15545 DebugLoc DL = MI->getDebugLoc(); 15546 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15547 15548 MachineFunction *MF = MBB->getParent(); 15549 MachineRegisterInfo &MRI = MF->getRegInfo(); 15550 15551 const BasicBlock *BB = MBB->getBasicBlock(); 15552 MachineFunction::iterator I = MBB; 15553 ++I; 15554 15555 // Memory Reference 15556 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 15557 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 15558 15559 unsigned DstReg; 15560 unsigned MemOpndSlot = 0; 15561 15562 unsigned CurOp = 0; 15563 15564 DstReg = MI->getOperand(CurOp++).getReg(); 15565 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 15566 assert(RC->hasType(MVT::i32) && "Invalid destination!"); 15567 unsigned mainDstReg = MRI.createVirtualRegister(RC); 15568 unsigned restoreDstReg = MRI.createVirtualRegister(RC); 15569 15570 MemOpndSlot = CurOp; 15571 15572 MVT PVT = getPointerTy(); 15573 assert((PVT == MVT::i64 || PVT == MVT::i32) && 15574 "Invalid Pointer Size!"); 15575 15576 // For v = setjmp(buf), we generate 15577 // 15578 // thisMBB: 15579 // buf[LabelOffset] = restoreMBB 15580 // SjLjSetup restoreMBB 15581 // 15582 // mainMBB: 15583 // v_main = 0 15584 // 15585 // sinkMBB: 15586 // v = phi(main, restore) 15587 // 15588 // restoreMBB: 15589 // v_restore = 1 15590 15591 MachineBasicBlock *thisMBB = MBB; 15592 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 15593 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 15594 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); 15595 MF->insert(I, mainMBB); 15596 MF->insert(I, sinkMBB); 15597 MF->push_back(restoreMBB); 15598 15599 MachineInstrBuilder MIB; 15600 15601 // Transfer the remainder of BB and its successor edges to sinkMBB. 15602 sinkMBB->splice(sinkMBB->begin(), MBB, 15603 llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); 15604 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 15605 15606 // thisMBB: 15607 unsigned PtrStoreOpc = 0; 15608 unsigned LabelReg = 0; 15609 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 15610 Reloc::Model RM = getTargetMachine().getRelocationModel(); 15611 bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) && 15612 (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); 15613 15614 // Prepare IP either in reg or imm. 15615 if (!UseImmLabel) { 15616 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; 15617 const TargetRegisterClass *PtrRC = getRegClassFor(PVT); 15618 LabelReg = MRI.createVirtualRegister(PtrRC); 15619 if (Subtarget->is64Bit()) { 15620 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) 15621 .addReg(X86::RIP) 15622 .addImm(0) 15623 .addReg(0) 15624 .addMBB(restoreMBB) 15625 .addReg(0); 15626 } else { 15627 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); 15628 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) 15629 .addReg(XII->getGlobalBaseReg(MF)) 15630 .addImm(0) 15631 .addReg(0) 15632 .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) 15633 .addReg(0); 15634 } 15635 } else 15636 PtrStoreOpc = (PVT == MVT::i64) ? 
X86::MOV64mi32 : X86::MOV32mi; 15637 // Store IP 15638 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); 15639 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 15640 if (i == X86::AddrDisp) 15641 MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); 15642 else 15643 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 15644 } 15645 if (!UseImmLabel) 15646 MIB.addReg(LabelReg); 15647 else 15648 MIB.addMBB(restoreMBB); 15649 MIB.setMemRefs(MMOBegin, MMOEnd); 15650 // Setup 15651 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) 15652 .addMBB(restoreMBB); 15653 15654 const X86RegisterInfo *RegInfo = 15655 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 15656 MIB.addRegMask(RegInfo->getNoPreservedMask()); 15657 thisMBB->addSuccessor(mainMBB); 15658 thisMBB->addSuccessor(restoreMBB); 15659 15660 // mainMBB: 15661 // EAX = 0 15662 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); 15663 mainMBB->addSuccessor(sinkMBB); 15664 15665 // sinkMBB: 15666 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 15667 TII->get(X86::PHI), DstReg) 15668 .addReg(mainDstReg).addMBB(mainMBB) 15669 .addReg(restoreDstReg).addMBB(restoreMBB); 15670 15671 // restoreMBB: 15672 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); 15673 BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB); 15674 restoreMBB->addSuccessor(sinkMBB); 15675 15676 MI->eraseFromParent(); 15677 return sinkMBB; 15678} 15679 15680MachineBasicBlock * 15681X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, 15682 MachineBasicBlock *MBB) const { 15683 DebugLoc DL = MI->getDebugLoc(); 15684 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15685 15686 MachineFunction *MF = MBB->getParent(); 15687 MachineRegisterInfo &MRI = MF->getRegInfo(); 15688 15689 // Memory Reference 15690 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 15691 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 15692 15693 MVT PVT = getPointerTy(); 15694 assert((PVT == MVT::i64 || PVT == MVT::i32) && 15695 "Invalid Pointer Size!"); 15696 15697 const TargetRegisterClass *RC = 15698 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; 15699 unsigned Tmp = MRI.createVirtualRegister(RC); 15700 // Since FP is only updated here but NOT referenced, it's treated as GPR. 15701 const X86RegisterInfo *RegInfo = 15702 static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); 15703 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; 15704 unsigned SP = RegInfo->getStackRegister(); 15705 15706 MachineInstrBuilder MIB; 15707 15708 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 15709 const int64_t SPOffset = 2 * PVT.getStoreSize(); 15710 15711 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; 15712 unsigned IJmpOpc = (PVT == MVT::i64) ? 
X86::JMP64r : X86::JMP32r; 15713 15714 // Reload FP 15715 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); 15716 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 15717 MIB.addOperand(MI->getOperand(i)); 15718 MIB.setMemRefs(MMOBegin, MMOEnd); 15719 // Reload IP 15720 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); 15721 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 15722 if (i == X86::AddrDisp) 15723 MIB.addDisp(MI->getOperand(i), LabelOffset); 15724 else 15725 MIB.addOperand(MI->getOperand(i)); 15726 } 15727 MIB.setMemRefs(MMOBegin, MMOEnd); 15728 // Reload SP 15729 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); 15730 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 15731 if (i == X86::AddrDisp) 15732 MIB.addDisp(MI->getOperand(i), SPOffset); 15733 else 15734 MIB.addOperand(MI->getOperand(i)); 15735 } 15736 MIB.setMemRefs(MMOBegin, MMOEnd); 15737 // Jump 15738 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); 15739 15740 MI->eraseFromParent(); 15741 return MBB; 15742} 15743 15744MachineBasicBlock * 15745X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 15746 MachineBasicBlock *BB) const { 15747 switch (MI->getOpcode()) { 15748 default: llvm_unreachable("Unexpected instr type to insert"); 15749 case X86::TAILJMPd64: 15750 case X86::TAILJMPr64: 15751 case X86::TAILJMPm64: 15752 llvm_unreachable("TAILJMP64 would not be touched here."); 15753 case X86::TCRETURNdi64: 15754 case X86::TCRETURNri64: 15755 case X86::TCRETURNmi64: 15756 return BB; 15757 case X86::WIN_ALLOCA: 15758 return EmitLoweredWinAlloca(MI, BB); 15759 case X86::SEG_ALLOCA_32: 15760 return EmitLoweredSegAlloca(MI, BB, false); 15761 case X86::SEG_ALLOCA_64: 15762 return EmitLoweredSegAlloca(MI, BB, true); 15763 case X86::TLSCall_32: 15764 case X86::TLSCall_64: 15765 return EmitLoweredTLSCall(MI, BB); 15766 case X86::CMOV_GR8: 15767 case X86::CMOV_FR32: 15768 case X86::CMOV_FR64: 15769 case X86::CMOV_V4F32: 15770 case X86::CMOV_V2F64: 15771 case X86::CMOV_V2I64: 15772 case X86::CMOV_V8F32: 15773 case X86::CMOV_V4F64: 15774 case X86::CMOV_V4I64: 15775 case X86::CMOV_V16F32: 15776 case X86::CMOV_V8F64: 15777 case X86::CMOV_V8I64: 15778 case X86::CMOV_GR16: 15779 case X86::CMOV_GR32: 15780 case X86::CMOV_RFP32: 15781 case X86::CMOV_RFP64: 15782 case X86::CMOV_RFP80: 15783 return EmitLoweredSelect(MI, BB); 15784 15785 case X86::FP32_TO_INT16_IN_MEM: 15786 case X86::FP32_TO_INT32_IN_MEM: 15787 case X86::FP32_TO_INT64_IN_MEM: 15788 case X86::FP64_TO_INT16_IN_MEM: 15789 case X86::FP64_TO_INT32_IN_MEM: 15790 case X86::FP64_TO_INT64_IN_MEM: 15791 case X86::FP80_TO_INT16_IN_MEM: 15792 case X86::FP80_TO_INT32_IN_MEM: 15793 case X86::FP80_TO_INT64_IN_MEM: { 15794 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 15795 DebugLoc DL = MI->getDebugLoc(); 15796 15797 // Change the floating point control register to use "round towards zero" 15798 // mode when truncating to an integer value. 15799 MachineFunction *F = BB->getParent(); 15800 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 15801 addFrameReference(BuildMI(*BB, MI, DL, 15802 TII->get(X86::FNSTCW16m)), CWFrameIdx); 15803 15804 // Load the old value of the high byte of the control word... 15805 unsigned OldCW = 15806 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); 15807 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 15808 CWFrameIdx); 15809 15810 // Set the high part to be round to zero... 
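// (0xC7F keeps all floating-point exceptions masked via the low 0x7F bits
// and sets the rounding-control field, bits 11:10, to 11b = round toward
// zero, which is what C's truncating float-to-int conversion requires.)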
15811 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 15812 .addImm(0xC7F); 15813 15814 // Reload the modified control word now... 15815 addFrameReference(BuildMI(*BB, MI, DL, 15816 TII->get(X86::FLDCW16m)), CWFrameIdx); 15817 15818 // Restore the memory image of control word to original value 15819 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 15820 .addReg(OldCW); 15821 15822 // Get the X86 opcode to use. 15823 unsigned Opc; 15824 switch (MI->getOpcode()) { 15825 default: llvm_unreachable("illegal opcode!"); 15826 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 15827 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 15828 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 15829 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 15830 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 15831 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 15832 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 15833 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 15834 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 15835 } 15836 15837 X86AddressMode AM; 15838 MachineOperand &Op = MI->getOperand(0); 15839 if (Op.isReg()) { 15840 AM.BaseType = X86AddressMode::RegBase; 15841 AM.Base.Reg = Op.getReg(); 15842 } else { 15843 AM.BaseType = X86AddressMode::FrameIndexBase; 15844 AM.Base.FrameIndex = Op.getIndex(); 15845 } 15846 Op = MI->getOperand(1); 15847 if (Op.isImm()) 15848 AM.Scale = Op.getImm(); 15849 Op = MI->getOperand(2); 15850 if (Op.isImm()) 15851 AM.IndexReg = Op.getImm(); 15852 Op = MI->getOperand(3); 15853 if (Op.isGlobal()) { 15854 AM.GV = Op.getGlobal(); 15855 } else { 15856 AM.Disp = Op.getImm(); 15857 } 15858 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 15859 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 15860 15861 // Reload the original control word now. 15862 addFrameReference(BuildMI(*BB, MI, DL, 15863 TII->get(X86::FLDCW16m)), CWFrameIdx); 15864 15865 MI->eraseFromParent(); // The pseudo instruction is gone now. 15866 return BB; 15867 } 15868 // String/text processing lowering. 15869 case X86::PCMPISTRM128REG: 15870 case X86::VPCMPISTRM128REG: 15871 case X86::PCMPISTRM128MEM: 15872 case X86::VPCMPISTRM128MEM: 15873 case X86::PCMPESTRM128REG: 15874 case X86::VPCMPESTRM128REG: 15875 case X86::PCMPESTRM128MEM: 15876 case X86::VPCMPESTRM128MEM: 15877 assert(Subtarget->hasSSE42() && 15878 "Target must have SSE4.2 or AVX features enabled"); 15879 return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo()); 15880 15881 // String/text processing lowering. 15882 case X86::PCMPISTRIREG: 15883 case X86::VPCMPISTRIREG: 15884 case X86::PCMPISTRIMEM: 15885 case X86::VPCMPISTRIMEM: 15886 case X86::PCMPESTRIREG: 15887 case X86::VPCMPESTRIREG: 15888 case X86::PCMPESTRIMEM: 15889 case X86::VPCMPESTRIMEM: 15890 assert(Subtarget->hasSSE42() && 15891 "Target must have SSE4.2 or AVX features enabled"); 15892 return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo()); 15893 15894 // Thread synchronization. 15895 case X86::MONITOR: 15896 return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget); 15897 15898 // xbegin 15899 case X86::XBEGIN: 15900 return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo()); 15901 15902 // Atomic Lowering. 
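// There is no single x86 instruction that both applies one of these
// operations and returns the old value (XADD/XCHG aside), so
// EmitAtomicLoadArith expands each of these pseudos into, roughly, a
// load / compute / LOCK CMPXCHG retry loop.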
15903  case X86::ATOMAND8:
15904  case X86::ATOMAND16:
15905  case X86::ATOMAND32:
15906  case X86::ATOMAND64:
15907    // Fall through
15908  case X86::ATOMOR8:
15909  case X86::ATOMOR16:
15910  case X86::ATOMOR32:
15911  case X86::ATOMOR64:
15912    // Fall through
15913  case X86::ATOMXOR8:
15914  case X86::ATOMXOR16:
15915  case X86::ATOMXOR32:
15916  case X86::ATOMXOR64:
15917    // Fall through
15918  case X86::ATOMNAND8:
15919  case X86::ATOMNAND16:
15920  case X86::ATOMNAND32:
15921  case X86::ATOMNAND64:
15922    // Fall through
15923  case X86::ATOMMAX8:
15924  case X86::ATOMMAX16:
15925  case X86::ATOMMAX32:
15926  case X86::ATOMMAX64:
15927    // Fall through
15928  case X86::ATOMMIN8:
15929  case X86::ATOMMIN16:
15930  case X86::ATOMMIN32:
15931  case X86::ATOMMIN64:
15932    // Fall through
15933  case X86::ATOMUMAX8:
15934  case X86::ATOMUMAX16:
15935  case X86::ATOMUMAX32:
15936  case X86::ATOMUMAX64:
15937    // Fall through
15938  case X86::ATOMUMIN8:
15939  case X86::ATOMUMIN16:
15940  case X86::ATOMUMIN32:
15941  case X86::ATOMUMIN64:
15942    return EmitAtomicLoadArith(MI, BB);
15943
15944  // This group does 64-bit operations on a 32-bit host.
15945  case X86::ATOMAND6432:
15946  case X86::ATOMOR6432:
15947  case X86::ATOMXOR6432:
15948  case X86::ATOMNAND6432:
15949  case X86::ATOMADD6432:
15950  case X86::ATOMSUB6432:
15951  case X86::ATOMMAX6432:
15952  case X86::ATOMMIN6432:
15953  case X86::ATOMUMAX6432:
15954  case X86::ATOMUMIN6432:
15955  case X86::ATOMSWAP6432:
15956    return EmitAtomicLoadArith6432(MI, BB);
15957
15958  case X86::VASTART_SAVE_XMM_REGS:
15959    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
15960
15961  case X86::VAARG_64:
15962    return EmitVAARG64WithCustomInserter(MI, BB);
15963
15964  case X86::EH_SjLj_SetJmp32:
15965  case X86::EH_SjLj_SetJmp64:
15966    return emitEHSjLjSetJmp(MI, BB);
15967
15968  case X86::EH_SjLj_LongJmp32:
15969  case X86::EH_SjLj_LongJmp64:
15970    return emitEHSjLjLongJmp(MI, BB);
15971  }
15972}
15973
15974//===----------------------------------------------------------------------===//
15975//                           X86 Optimization Hooks
15976//===----------------------------------------------------------------------===//
15977
15978void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
15979                                                       APInt &KnownZero,
15980                                                       APInt &KnownOne,
15981                                                       const SelectionDAG &DAG,
15982                                                       unsigned Depth) const {
15983  unsigned BitWidth = KnownZero.getBitWidth();
15984  unsigned Opc = Op.getOpcode();
15985  assert((Opc >= ISD::BUILTIN_OP_END ||
15986          Opc == ISD::INTRINSIC_WO_CHAIN ||
15987          Opc == ISD::INTRINSIC_W_CHAIN ||
15988          Opc == ISD::INTRINSIC_VOID) &&
15989         "Should use MaskedValueIsZero if you don't know whether Op"
15990         " is a target node!");
15991
15992  KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
15993  switch (Opc) {
15994  default: break;
15995  case X86ISD::ADD:
15996  case X86ISD::SUB:
15997  case X86ISD::ADC:
15998  case X86ISD::SBB:
15999  case X86ISD::SMUL:
16000  case X86ISD::UMUL:
16001  case X86ISD::INC:
16002  case X86ISD::DEC:
16003  case X86ISD::OR:
16004  case X86ISD::XOR:
16005  case X86ISD::AND:
16006    // These nodes' second result is a boolean.
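// (Result 0 is the ordinary arithmetic value, about which nothing is
// claimed; only the boolean result 1 is handled below.)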
16007 if (Op.getResNo() == 0) 16008 break; 16009 // Fallthrough 16010 case X86ISD::SETCC: 16011 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 16012 break; 16013 case ISD::INTRINSIC_WO_CHAIN: { 16014 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 16015 unsigned NumLoBits = 0; 16016 switch (IntId) { 16017 default: break; 16018 case Intrinsic::x86_sse_movmsk_ps: 16019 case Intrinsic::x86_avx_movmsk_ps_256: 16020 case Intrinsic::x86_sse2_movmsk_pd: 16021 case Intrinsic::x86_avx_movmsk_pd_256: 16022 case Intrinsic::x86_mmx_pmovmskb: 16023 case Intrinsic::x86_sse2_pmovmskb_128: 16024 case Intrinsic::x86_avx2_pmovmskb: { 16025 // High bits of movmskp{s|d}, pmovmskb are known zero. 16026 switch (IntId) { 16027 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 16028 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; 16029 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; 16030 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; 16031 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; 16032 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; 16033 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; 16034 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; 16035 } 16036 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); 16037 break; 16038 } 16039 } 16040 break; 16041 } 16042 } 16043} 16044 16045unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 16046 unsigned Depth) const { 16047 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 16048 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 16049 return Op.getValueType().getScalarType().getSizeInBits(); 16050 16051 // Fallback case. 16052 return 1; 16053} 16054 16055/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 16056/// node is a GlobalAddress + offset. 
16057bool X86TargetLowering::isGAPlusOffset(SDNode *N,
16058                                       const GlobalValue* &GA,
16059                                       int64_t &Offset) const {
16060  if (N->getOpcode() == X86ISD::Wrapper) {
16061    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
16062      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
16063      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
16064      return true;
16065    }
16066  }
16067  return TargetLowering::isGAPlusOffset(N, GA, Offset);
16068}
16069
16070/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
16071/// same as extracting the high 128-bit part of a 256-bit vector and then
16072/// inserting the result into the low part of a new 256-bit vector.
16073static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
16074  EVT VT = SVOp->getValueType(0);
16075  unsigned NumElems = VT.getVectorNumElements();
16076
16077  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16078  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
16079    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
16080        SVOp->getMaskElt(j) >= 0)
16081      return false;
16082
16083  return true;
16084}
16085
16086/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
16087/// same as extracting the low 128-bit part of a 256-bit vector and then
16088/// inserting the result into the high part of a new 256-bit vector.
16089static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
16090  EVT VT = SVOp->getValueType(0);
16091  unsigned NumElems = VT.getVectorNumElements();
16092
16093  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16094  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
16095    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
16096        SVOp->getMaskElt(j) >= 0)
16097      return false;
16098
16099  return true;
16100}
16101
16102/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
16103static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
16104                                        TargetLowering::DAGCombinerInfo &DCI,
16105                                        const X86Subtarget* Subtarget) {
16106  SDLoc dl(N);
16107  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
16108  SDValue V1 = SVOp->getOperand(0);
16109  SDValue V2 = SVOp->getOperand(1);
16110  EVT VT = SVOp->getValueType(0);
16111  unsigned NumElems = VT.getVectorNumElements();
16112
16113  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
16114      V2.getOpcode() == ISD::CONCAT_VECTORS) {
16115    //
16116    //                   0,0,0,...
16117    //                      |
16118    //    V      UNDEF    BUILD_VECTOR    UNDEF
16119    //     \      /           \           /
16120    //      CONCAT_VECTOR       CONCAT_VECTOR
16121    //            \                /
16122    //             \              /
16123    //          RESULT: V + zero extended
16124    //
16125    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
16126        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
16127        V1.getOperand(1).getOpcode() != ISD::UNDEF)
16128      return SDValue();
16129
16130    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
16131      return SDValue();
16132
16133    // To match the shuffle mask, the first half of the mask should
16134    // be exactly the first vector, and all the rest a splat with the
16135    // first element of the second one.
16136    for (unsigned i = 0; i != NumElems/2; ++i)
16137      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
16138          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
16139        return SDValue();
16140
16141    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
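// (X86ISD::VZEXT_LOAD loads the original, narrower value and zero-fills
// the upper lanes, which is exactly what this concat-with-zeros pattern
// computes, so the whole shuffle can collapse into a single load.)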
16142    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
16143      if (Ld->hasNUsesOfValue(1, 0)) {
16144        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
16145        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
16146        SDValue ResNode =
16147          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
16148                                  array_lengthof(Ops),
16149                                  Ld->getMemoryVT(),
16150                                  Ld->getPointerInfo(),
16151                                  Ld->getAlignment(),
16152                                  false/*isVolatile*/, true/*ReadMem*/,
16153                                  false/*WriteMem*/);
16154
16155        // Make sure the newly-created LOAD is in the same position as Ld in
16156        // terms of dependency. We create a TokenFactor for Ld and ResNode,
16157        // and update uses of Ld's output chain to use the TokenFactor.
16158        if (Ld->hasAnyUseOfValue(1)) {
16159          SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
16160                             SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
16161          DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
16162          DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
16163                                 SDValue(ResNode.getNode(), 1));
16164        }
16165
16166        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
16167      }
16168    }
16169
16170    // Emit a zeroed vector and insert the desired subvector on its
16171    // first half.
16172    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
16173    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
16174    return DCI.CombineTo(N, InsV);
16175  }
16176
16177  //===--------------------------------------------------------------------===//
16178  // Combine some shuffles into subvector extracts and inserts:
16179  //
16180
16181  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16182  if (isShuffleHigh128VectorInsertLow(SVOp)) {
16183    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
16184    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
16185    return DCI.CombineTo(N, InsV);
16186  }
16187
16188  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16189  if (isShuffleLow128VectorInsertHigh(SVOp)) {
16190    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
16191    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
16192    return DCI.CombineTo(N, InsV);
16193  }
16194
16195  return SDValue();
16196}
16197
16198/// PerformShuffleCombine - Performs several different shuffle combines.
16199static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
16200                                     TargetLowering::DAGCombinerInfo &DCI,
16201                                     const X86Subtarget *Subtarget) {
16202  SDLoc dl(N);
16203  EVT VT = N->getValueType(0);
16204
16205  // Don't create instructions with illegal types after legalize types has run.
16206  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16207  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
16208    return SDValue();
16209
16210  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode.
16211  if (Subtarget->hasFp256() && VT.is256BitVector() &&
16212      N->getOpcode() == ISD::VECTOR_SHUFFLE)
16213    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
16214
16215  // Only handle 128-bit wide vectors from here on.
16216  if (!VT.is128BitVector())
16217    return SDValue();
16218
16219  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
16220  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
16221  // consecutive, non-overlapping, and in the right order.
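// Schematically, the pattern being matched is:
//   (v4f32 (vector_shuffle
//            (build_vector (load p), (load p+4), (load p+8), (load p+12)),
//            undef, <0,1,2,3>))
//     --> (v4f32 (load p))
// (types and offsets illustrative; EltsFromConsecutiveLoads does the
// actual legality checking).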
16222  SmallVector<SDValue, 16> Elts;
16223  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
16224    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
16225
16226  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
16227}
16228
16229/// PerformTruncateCombine - Converts a truncate operation into a sequence
16230/// of vector shuffle operations.
16231/// This is possible when we truncate a 256-bit vector into a 128-bit vector.
16232static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
16233                                      TargetLowering::DAGCombinerInfo &DCI,
16234                                      const X86Subtarget *Subtarget) {
16235  return SDValue();
16236}
16237
16238/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
16239/// specific shuffle of a load can be folded into a single element load.
16240/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
16241/// shuffles have been custom lowered so we need to handle those here.
16242static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
16243                                                TargetLowering::DAGCombinerInfo &DCI) {
16244  if (DCI.isBeforeLegalizeOps())
16245    return SDValue();
16246
16247  SDValue InVec = N->getOperand(0);
16248  SDValue EltNo = N->getOperand(1);
16249
16250  if (!isa<ConstantSDNode>(EltNo))
16251    return SDValue();
16252
16253  EVT VT = InVec.getValueType();
16254
16255  bool HasShuffleIntoBitcast = false;
16256  if (InVec.getOpcode() == ISD::BITCAST) {
16257    // Don't duplicate a load with other uses.
16258    if (!InVec.hasOneUse())
16259      return SDValue();
16260    EVT BCVT = InVec.getOperand(0).getValueType();
16261    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
16262      return SDValue();
16263    InVec = InVec.getOperand(0);
16264    HasShuffleIntoBitcast = true;
16265  }
16266
16267  if (!isTargetShuffle(InVec.getOpcode()))
16268    return SDValue();
16269
16270  // Don't duplicate a load with other uses.
16271  if (!InVec.hasOneUse())
16272    return SDValue();
16273
16274  SmallVector<int, 16> ShuffleMask;
16275  bool UnaryShuffle;
16276  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
16277                            UnaryShuffle))
16278    return SDValue();
16279
16280  // Select the input vector, guarding against out-of-range extracts.
16281  unsigned NumElems = VT.getVectorNumElements();
16282  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
16283  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
16284  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
16285                                         : InVec.getOperand(1);
16286
16287  // If inputs to shuffle are the same for both ops, then allow 2 uses
16288  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
16289
16290  if (LdNode.getOpcode() == ISD::BITCAST) {
16291    // Don't duplicate a load with other uses.
16292    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
16293      return SDValue();
16294
16295    AllowedUses = 1; // only allow 1 load use if we have a bitcast
16296    LdNode = LdNode.getOperand(0);
16297  }
16298
16299  if (!ISD::isNormalLoad(LdNode.getNode()))
16300    return SDValue();
16301
16302  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
16303
16304  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
16305    return SDValue();
16306
16307  if (HasShuffleIntoBitcast) {
16308    // If there's a bitcast before the shuffle, check if the load type and
16309    // alignment are valid.
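// (Re-typing the load at the wider vector type can raise the required ABI
// alignment; if the original load is under-aligned for VT, folding would
// create an invalid load, so bail out.)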
16310    unsigned Align = LN0->getAlignment();
16311    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16312    unsigned NewAlign = TLI.getDataLayout()->
16313      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
16314
16315    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
16316      return SDValue();
16317  }
16318
16319  // All checks match so transform back to vector_shuffle so that DAG combiner
16320  // can finish the job.
16321  SDLoc dl(N);
16322
16323  // Create shuffle node taking into account the case that it's a unary shuffle.
16324  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
16325  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
16326                                 InVec.getOperand(0), Shuffle,
16327                                 &ShuffleMask[0]);
16328  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
16329  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
16330                     EltNo);
16331}
16332
16333/// Extract one bit from a mask vector, like v16i1 or v8i1.
16334/// AVX-512 feature.
16335static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) {
16336  SDValue Vec = N->getOperand(0);
16337  SDLoc dl(Vec);
16338  MVT VecVT = Vec.getSimpleValueType();
16339  SDValue Idx = N->getOperand(1);
16340  MVT EltVT = N->getSimpleValueType(0);
16341
16342  assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) &&
16343         "Unexpected operands in ExtractBitFromMaskVector");
16344
16345  // variable index
16346  if (!isa<ConstantSDNode>(Idx)) {
16347    MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
16348    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
16349    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
16350                              ExtVT.getVectorElementType(), Ext);
16351    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
16352  }
16353
16354  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
16355
16356  MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits());
16357  unsigned MaxShift = VecVT.getSizeInBits() - 1;
16358  Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec);
16359  Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec,
16360                    DAG.getConstant(MaxShift - IdxVal, ScalarVT));
16361  Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec,
16362                    DAG.getConstant(MaxShift, ScalarVT));
16363
16364  if (VecVT == MVT::v16i1) {
16365    Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec);
16366    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec);
16367  }
16368  return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec);
16369}
16370
16371/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
16372/// generation and convert it from being a bunch of shuffles and extracts
16373/// to a simple store and scalar loads to extract the elements.
16374static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
16375                                                TargetLowering::DAGCombinerInfo &DCI) {
16376  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
16377  if (NewOp.getNode())
16378    return NewOp;
16379
16380  SDValue InputVector = N->getOperand(0);
16381
16382  if (InputVector.getValueType().getVectorElementType() == MVT::i1 &&
16383      !DCI.isBeforeLegalize())
16384    return ExtractBitFromMaskVector(N, DAG);
16385
16386  // Detect whether we are trying to convert from mmx to i32 and the bitcast
16387  // from mmx to v2i32 has a single usage.
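// (X86ISD::MMX_MOVD2W corresponds to a movd from an MMX register to a
// 32-bit GPR, so the bitcast + extract pair becomes a single move.)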
16388  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
16389      InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
16390      InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
16391    return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
16392                       N->getValueType(0),
16393                       InputVector.getNode()->getOperand(0));
16394
16395  // Only operate on vectors of 4 elements, where the alternative shuffling
16396  // gets to be more expensive.
16397  if (InputVector.getValueType() != MVT::v4i32)
16398    return SDValue();
16399
16400  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
16401  // single use which is a sign-extend or zero-extend, and all elements are
16402  // used.
16403  SmallVector<SDNode *, 4> Uses;
16404  unsigned ExtractedElements = 0;
16405  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
16406       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
16407    if (UI.getUse().getResNo() != InputVector.getResNo())
16408      return SDValue();
16409
16410    SDNode *Extract = *UI;
16411    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16412      return SDValue();
16413
16414    if (Extract->getValueType(0) != MVT::i32)
16415      return SDValue();
16416    if (!Extract->hasOneUse())
16417      return SDValue();
16418    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
16419        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
16420      return SDValue();
16421    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
16422      return SDValue();
16423
16424    // Record which element was extracted.
16425    ExtractedElements |=
16426      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
16427
16428    Uses.push_back(Extract);
16429  }
16430
16431  // If not all the elements were used, this may not be worthwhile.
16432  if (ExtractedElements != 15)
16433    return SDValue();
16434
16435  // Ok, we've now decided to do the transformation.
16436  SDLoc dl(InputVector);
16437
16438  // Store the value to a temporary stack slot.
16439  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
16440  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
16441                            MachinePointerInfo(), false, false, 0);
16442
16443  // Replace each use (extract) with a load of the appropriate element.
16444  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
16445       UE = Uses.end(); UI != UE; ++UI) {
16446    SDNode *Extract = *UI;
16447
16448    // Compute the element's address.
16449    SDValue Idx = Extract->getOperand(1);
16450    unsigned EltSize =
16451        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
16452    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
16453    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16454    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
16455
16456    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
16457                                     StackPtr, OffsetVal);
16458
16459    // Load the scalar.
16460    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
16461                                     ScalarAddr, MachinePointerInfo(),
16462                                     false, false, false, 0);
16463
16464    // Replace the extract with the load.
16465    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
16466  }
16467
16468  // The replacement was made in place; don't return anything.
16469  return SDValue();
16470}
16471
16472/// \brief Matches a VSELECT onto min/max or returns 0 if the node doesn't match.
16473static std::pair<unsigned, bool> 16474matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, 16475 SelectionDAG &DAG, const X86Subtarget *Subtarget) { 16476 if (!VT.isVector()) 16477 return std::make_pair(0, false); 16478 16479 bool NeedSplit = false; 16480 switch (VT.getSimpleVT().SimpleTy) { 16481 default: return std::make_pair(0, false); 16482 case MVT::v32i8: 16483 case MVT::v16i16: 16484 case MVT::v8i32: 16485 if (!Subtarget->hasAVX2()) 16486 NeedSplit = true; 16487 if (!Subtarget->hasAVX()) 16488 return std::make_pair(0, false); 16489 break; 16490 case MVT::v16i8: 16491 case MVT::v8i16: 16492 case MVT::v4i32: 16493 if (!Subtarget->hasSSE2()) 16494 return std::make_pair(0, false); 16495 } 16496 16497 // SSE2 has only a small subset of the operations. 16498 bool hasUnsigned = Subtarget->hasSSE41() || 16499 (Subtarget->hasSSE2() && VT == MVT::v16i8); 16500 bool hasSigned = Subtarget->hasSSE41() || 16501 (Subtarget->hasSSE2() && VT == MVT::v8i16); 16502 16503 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 16504 16505 unsigned Opc = 0; 16506 // Check for x CC y ? x : y. 16507 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 16508 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 16509 switch (CC) { 16510 default: break; 16511 case ISD::SETULT: 16512 case ISD::SETULE: 16513 Opc = hasUnsigned ? X86ISD::UMIN : 0; break; 16514 case ISD::SETUGT: 16515 case ISD::SETUGE: 16516 Opc = hasUnsigned ? X86ISD::UMAX : 0; break; 16517 case ISD::SETLT: 16518 case ISD::SETLE: 16519 Opc = hasSigned ? X86ISD::SMIN : 0; break; 16520 case ISD::SETGT: 16521 case ISD::SETGE: 16522 Opc = hasSigned ? X86ISD::SMAX : 0; break; 16523 } 16524 // Check for x CC y ? y : x -- a min/max with reversed arms. 16525 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 16526 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 16527 switch (CC) { 16528 default: break; 16529 case ISD::SETULT: 16530 case ISD::SETULE: 16531 Opc = hasUnsigned ? X86ISD::UMAX : 0; break; 16532 case ISD::SETUGT: 16533 case ISD::SETUGE: 16534 Opc = hasUnsigned ? X86ISD::UMIN : 0; break; 16535 case ISD::SETLT: 16536 case ISD::SETLE: 16537 Opc = hasSigned ? X86ISD::SMAX : 0; break; 16538 case ISD::SETGT: 16539 case ISD::SETGE: 16540 Opc = hasSigned ? X86ISD::SMIN : 0; break; 16541 } 16542 } 16543 16544 return std::make_pair(Opc, NeedSplit); 16545} 16546 16547/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT 16548/// nodes. 16549static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 16550 TargetLowering::DAGCombinerInfo &DCI, 16551 const X86Subtarget *Subtarget) { 16552 SDLoc DL(N); 16553 SDValue Cond = N->getOperand(0); 16554 // Get the LHS/RHS of the select. 16555 SDValue LHS = N->getOperand(1); 16556 SDValue RHS = N->getOperand(2); 16557 EVT VT = LHS.getValueType(); 16558 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16559 16560 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 16561 // instructions match the semantics of the common C idiom x<y?x:y but not 16562 // x<=y?x:y, because of how they handle negative zero (which can be 16563 // ignored in unsafe-math mode). 16564 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && 16565 VT != MVT::f80 && TLI.isTypeLegal(VT) && 16566 (Subtarget->hasSSE2() || 16567 (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { 16568 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 16569 16570 unsigned Opcode = 0; 16571 // Check for x CC y ? x : y. 
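// (The case analysis below leans on the hardware semantics: MINSS/MINPS
// compute src1 < src2 ? src1 : src2, and MAXSS/MAXPS the mirror image, so
// the second operand is returned both for NaNs and for equal-valued
// +0.0/-0.0; hence the per-case worries about NaNs and signed zeros.)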
16572 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 16573 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 16574 switch (CC) { 16575 default: break; 16576 case ISD::SETULT: 16577 // Converting this to a min would handle NaNs incorrectly, and swapping 16578 // the operands would cause it to handle comparisons between positive 16579 // and negative zero incorrectly. 16580 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 16581 if (!DAG.getTarget().Options.UnsafeFPMath && 16582 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 16583 break; 16584 std::swap(LHS, RHS); 16585 } 16586 Opcode = X86ISD::FMIN; 16587 break; 16588 case ISD::SETOLE: 16589 // Converting this to a min would handle comparisons between positive 16590 // and negative zero incorrectly. 16591 if (!DAG.getTarget().Options.UnsafeFPMath && 16592 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 16593 break; 16594 Opcode = X86ISD::FMIN; 16595 break; 16596 case ISD::SETULE: 16597 // Converting this to a min would handle both negative zeros and NaNs 16598 // incorrectly, but we can swap the operands to fix both. 16599 std::swap(LHS, RHS); 16600 case ISD::SETOLT: 16601 case ISD::SETLT: 16602 case ISD::SETLE: 16603 Opcode = X86ISD::FMIN; 16604 break; 16605 16606 case ISD::SETOGE: 16607 // Converting this to a max would handle comparisons between positive 16608 // and negative zero incorrectly. 16609 if (!DAG.getTarget().Options.UnsafeFPMath && 16610 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 16611 break; 16612 Opcode = X86ISD::FMAX; 16613 break; 16614 case ISD::SETUGT: 16615 // Converting this to a max would handle NaNs incorrectly, and swapping 16616 // the operands would cause it to handle comparisons between positive 16617 // and negative zero incorrectly. 16618 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 16619 if (!DAG.getTarget().Options.UnsafeFPMath && 16620 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 16621 break; 16622 std::swap(LHS, RHS); 16623 } 16624 Opcode = X86ISD::FMAX; 16625 break; 16626 case ISD::SETUGE: 16627 // Converting this to a max would handle both negative zeros and NaNs 16628 // incorrectly, but we can swap the operands to fix both. 16629 std::swap(LHS, RHS); 16630 case ISD::SETOGT: 16631 case ISD::SETGT: 16632 case ISD::SETGE: 16633 Opcode = X86ISD::FMAX; 16634 break; 16635 } 16636 // Check for x CC y ? y : x -- a min/max with reversed arms. 16637 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 16638 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 16639 switch (CC) { 16640 default: break; 16641 case ISD::SETOGE: 16642 // Converting this to a min would handle comparisons between positive 16643 // and negative zero incorrectly, and swapping the operands would 16644 // cause it to handle NaNs incorrectly. 16645 if (!DAG.getTarget().Options.UnsafeFPMath && 16646 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 16647 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 16648 break; 16649 std::swap(LHS, RHS); 16650 } 16651 Opcode = X86ISD::FMIN; 16652 break; 16653 case ISD::SETUGT: 16654 // Converting this to a min would handle NaNs incorrectly. 16655 if (!DAG.getTarget().Options.UnsafeFPMath && 16656 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 16657 break; 16658 Opcode = X86ISD::FMIN; 16659 break; 16660 case ISD::SETUGE: 16661 // Converting this to a min would handle both negative zeros and NaNs 16662 // incorrectly, but we can swap the operands to fix both. 
16663 std::swap(LHS, RHS); 16664 case ISD::SETOGT: 16665 case ISD::SETGT: 16666 case ISD::SETGE: 16667 Opcode = X86ISD::FMIN; 16668 break; 16669 16670 case ISD::SETULT: 16671 // Converting this to a max would handle NaNs incorrectly. 16672 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 16673 break; 16674 Opcode = X86ISD::FMAX; 16675 break; 16676 case ISD::SETOLE: 16677 // Converting this to a max would handle comparisons between positive 16678 // and negative zero incorrectly, and swapping the operands would 16679 // cause it to handle NaNs incorrectly. 16680 if (!DAG.getTarget().Options.UnsafeFPMath && 16681 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 16682 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 16683 break; 16684 std::swap(LHS, RHS); 16685 } 16686 Opcode = X86ISD::FMAX; 16687 break; 16688 case ISD::SETULE: 16689 // Converting this to a max would handle both negative zeros and NaNs 16690 // incorrectly, but we can swap the operands to fix both. 16691 std::swap(LHS, RHS); 16692 case ISD::SETOLT: 16693 case ISD::SETLT: 16694 case ISD::SETLE: 16695 Opcode = X86ISD::FMAX; 16696 break; 16697 } 16698 } 16699 16700 if (Opcode) 16701 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 16702 } 16703 16704 EVT CondVT = Cond.getValueType(); 16705 if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && 16706 CondVT.getVectorElementType() == MVT::i1) { 16707 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper 16708 // lowering on AVX-512. In this case we convert it to 16709 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. 16710 // The same situation for all 128 and 256-bit vectors of i8 and i16 16711 EVT OpVT = LHS.getValueType(); 16712 if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && 16713 (OpVT.getVectorElementType() == MVT::i8 || 16714 OpVT.getVectorElementType() == MVT::i16)) { 16715 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); 16716 DCI.AddToWorklist(Cond.getNode()); 16717 return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); 16718 } 16719 } 16720 // If this is a select between two integer constants, try to do some 16721 // optimizations. 16722 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 16723 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 16724 // Don't do this for crazy integer types. 16725 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 16726 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 16727 // so that TrueC (the true value) is larger than FalseC. 16728 bool NeedsCondInvert = false; 16729 16730 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 16731 // Efficiently invertible. 16732 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 16733 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 16734 isa<ConstantSDNode>(Cond.getOperand(1))))) { 16735 NeedsCondInvert = true; 16736 std::swap(TrueC, FalseC); 16737 } 16738 16739 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 16740 if (FalseC->getAPIntValue() == 0 && 16741 TrueC->getAPIntValue().isPowerOf2()) { 16742 if (NeedsCondInvert) // Invert the condition if needed. 16743 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 16744 DAG.getConstant(1, Cond.getValueType())); 16745 16746 // Zero extend the condition if needed. 
16747 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 16748 16749 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 16750 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 16751 DAG.getConstant(ShAmt, MVT::i8)); 16752 } 16753 16754 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. 16755 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 16756 if (NeedsCondInvert) // Invert the condition if needed. 16757 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 16758 DAG.getConstant(1, Cond.getValueType())); 16759 16760 // Zero extend the condition if needed. 16761 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 16762 FalseC->getValueType(0), Cond); 16763 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 16764 SDValue(FalseC, 0)); 16765 } 16766 16767 // Optimize cases that will turn into an LEA instruction. This requires 16768 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 16769 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 16770 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 16771 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 16772 16773 bool isFastMultiplier = false; 16774 if (Diff < 10) { 16775 switch ((unsigned char)Diff) { 16776 default: break; 16777 case 1: // result = add base, cond 16778 case 2: // result = lea base( , cond*2) 16779 case 3: // result = lea base(cond, cond*2) 16780 case 4: // result = lea base( , cond*4) 16781 case 5: // result = lea base(cond, cond*4) 16782 case 8: // result = lea base( , cond*8) 16783 case 9: // result = lea base(cond, cond*8) 16784 isFastMultiplier = true; 16785 break; 16786 } 16787 } 16788 16789 if (isFastMultiplier) { 16790 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 16791 if (NeedsCondInvert) // Invert the condition if needed. 16792 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 16793 DAG.getConstant(1, Cond.getValueType())); 16794 16795 // Zero extend the condition if needed. 16796 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 16797 Cond); 16798 // Scale the condition by the difference. 16799 if (Diff != 1) 16800 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 16801 DAG.getConstant(Diff, Cond.getValueType())); 16802 16803 // Add the base if non-zero. 16804 if (FalseC->getAPIntValue() != 0) 16805 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 16806 SDValue(FalseC, 0)); 16807 return Cond; 16808 } 16809 } 16810 } 16811 } 16812 16813 // Canonicalize max and min: 16814 // (x > y) ? x : y -> (x >= y) ? x : y 16815 // (x < y) ? x : y -> (x <= y) ? x : y 16816 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates 16817 // the need for an extra compare 16818 // against zero. e.g. 16819 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0 16820 // subl %esi, %edi 16821 // testl %edi, %edi 16822 // movl $0, %eax 16823 // cmovgl %edi, %eax 16824 // => 16825 // xorl %eax, %eax 16826 // subl %esi, %edi 16827 // cmovsl %eax, %edi 16828 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && 16829 DAG.isEqualTo(LHS, Cond.getOperand(0)) && 16830 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 16831 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 16832 switch (CC) { 16833 default: break; 16834 case ISD::SETLT: 16835 case ISD::SETGT: { 16836 ISD::CondCode NewCC = (CC == ISD::SETLT) ?
ISD::SETLE : ISD::SETGE; 16837 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), 16838 Cond.getOperand(0), Cond.getOperand(1), NewCC); 16839 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); 16840 } 16841 } 16842 } 16843 16844 // Early exit check 16845 if (!TLI.isTypeLegal(VT)) 16846 return SDValue(); 16847 16848 // Match VSELECTs into subs with unsigned saturation. 16849 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && 16850 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. 16851 ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || 16852 (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { 16853 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 16854 16855 // Check if one of the arms of the VSELECT is a zero vector. If it's on the 16856 // left side invert the predicate to simplify logic below. 16857 SDValue Other; 16858 if (ISD::isBuildVectorAllZeros(LHS.getNode())) { 16859 Other = RHS; 16860 CC = ISD::getSetCCInverse(CC, true); 16861 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { 16862 Other = LHS; 16863 } 16864 16865 if (Other.getNode() && Other->getNumOperands() == 2 && 16866 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { 16867 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); 16868 SDValue CondRHS = Cond->getOperand(1); 16869 16870 // Look for a general sub with unsigned saturation first. 16871 // x >= y ? x-y : 0 --> subus x, y 16872 // x > y ? x-y : 0 --> subus x, y 16873 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && 16874 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) 16875 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); 16876 16877 // If the RHS is a constant we have to reverse the const canonicalization. 16878 // x > C-1 ? x+-C : 0 --> subus x, C 16879 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && 16880 isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { 16881 APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); 16882 if (CondRHS.getConstantOperandVal(0) == -A-1) 16883 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, 16884 DAG.getConstant(-A, VT)); 16885 } 16886 16887 // Another special case: If C was a sign bit, the sub has been 16888 // canonicalized into a xor. 16889 // FIXME: Would it be better to use ComputeMaskedBits to determine whether 16890 // it's safe to decanonicalize the xor? 16891 // x s< 0 ? x^C : 0 --> subus x, C 16892 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && 16893 ISD::isBuildVectorAllZeros(CondRHS.getNode()) && 16894 isSplatVector(OpRHS.getNode())) { 16895 APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); 16896 if (A.isSignBit()) 16897 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); 16898 } 16899 } 16900 } 16901 16902 // Try to match a min/max vector operation. 
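  // e.g. (vselect (setcc x, y, setult), x, y) on v16i8 becomes (X86ISD::UMIN x, y), i.e. PMINUB; the 256-bit forms matched on AVX without AVX2 come back with NeedSplit set and are handled below in two 128-bit halves that get re-concatenated.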
16903 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) { 16904 std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget); 16905 unsigned Opc = ret.first; 16906 bool NeedSplit = ret.second; 16907 16908 if (Opc && NeedSplit) { 16909 unsigned NumElems = VT.getVectorNumElements(); 16910 // Extract the LHS vectors 16911 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL); 16912 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL); 16913 16914 // Extract the RHS vectors 16915 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL); 16916 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL); 16917 16918 // Create min/max for each subvector 16919 LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1); 16920 RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2); 16921 16922 // Merge the result 16923 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS); 16924 } else if (Opc) 16925 return DAG.getNode(Opc, DL, VT, LHS, RHS); 16926 } 16927 16928 // Simplify vector selection if the selector will be produced by CMPP*/PCMP*. 16929 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && 16930 // Check if SETCC has already been promoted 16931 TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) { 16932 16933 assert(Cond.getValueType().isVector() && 16934 "vector select expects a vector selector!"); 16935 16936 EVT IntVT = Cond.getValueType(); 16937 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); 16938 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); 16939 16940 if (!TValIsAllOnes && !FValIsAllZeros) { 16941 // Try to invert the condition if the true value is not all 1s and the 16942 // false value is not all 0s. 16943 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); 16944 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); 16945 16946 if (TValIsAllZeros || FValIsAllOnes) { 16947 SDValue CC = Cond.getOperand(2); 16948 ISD::CondCode NewCC = 16949 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), 16950 Cond.getOperand(0).getValueType().isInteger()); 16951 Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); 16952 std::swap(LHS, RHS); 16953 TValIsAllOnes = FValIsAllOnes; 16954 FValIsAllZeros = TValIsAllZeros; 16955 } 16956 } 16957 16958 if (TValIsAllOnes || FValIsAllZeros) { 16959 SDValue Ret; 16960 16961 if (TValIsAllOnes && FValIsAllZeros) 16962 Ret = Cond; 16963 else if (TValIsAllOnes) 16964 Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond, 16965 DAG.getNode(ISD::BITCAST, DL, IntVT, RHS)); 16966 else if (FValIsAllZeros) 16967 Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond, 16968 DAG.getNode(ISD::BITCAST, DL, IntVT, LHS)); 16969 16970 return DAG.getNode(ISD::BITCAST, DL, VT, Ret); 16971 } 16972 } 16973 16974 // If we know that this node is legal then we know that it is going to be 16975 // matched by one of the SSE/AVX BLEND instructions. These instructions only 16976 // depend on the highest bit in each word. Try to use SimplifyDemandedBits 16977 // to simplify previous instructions. 16978 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && 16979 !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) { 16980 unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); 16981 16982 // Don't optimize vector selects that map to mask-registers.
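    // (a vXi1 selector, as used with the AVX-512 mask registers, is only one bit per lane, so there is no high bit for SimplifyDemandedBits to isolate)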
16983 if (BitWidth == 1) 16984 return SDValue(); 16985 16986 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); 16987 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); 16988 16989 APInt KnownZero, KnownOne; 16990 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), 16991 DCI.isBeforeLegalizeOps()); 16992 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || 16993 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) 16994 DCI.CommitTargetLoweringOpt(TLO); 16995 } 16996 16997 return SDValue(); 16998} 16999 17000// Check whether a boolean test is testing a boolean value generated by 17001// X86ISD::SETCC. If so, return the operand of that SETCC and the proper 17002// condition code. 17003// 17004// Simplify the following patterns: 17005// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or 17006// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) 17007// to (Op EFLAGS Cond) 17008// 17009// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or 17010// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) 17011// to (Op EFLAGS !Cond) 17012// 17013// where Op could be BRCOND or CMOV. 17014// 17015static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { 17016 // Quit unless this is a CMP, or a SUB whose value result is unused. 17017 if (Cmp.getOpcode() != X86ISD::CMP && 17018 (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) 17019 return SDValue(); 17020 17021 // Quit if not used as a boolean value. 17022 if (CC != X86::COND_E && CC != X86::COND_NE) 17023 return SDValue(); 17024 17025 // Check CMP operands. One of them should be 0 or 1 and the other should be 17026 // a SetCC or extended from it. 17027 SDValue Op1 = Cmp.getOperand(0); 17028 SDValue Op2 = Cmp.getOperand(1); 17029 17030 SDValue SetCC; 17031 const ConstantSDNode* C = 0; 17032 bool needOppositeCond = (CC == X86::COND_E); 17033 bool checkAgainstTrue = false; // Is it a comparison against 1? 17034 17035 if ((C = dyn_cast<ConstantSDNode>(Op1))) 17036 SetCC = Op2; 17037 else if ((C = dyn_cast<ConstantSDNode>(Op2))) 17038 SetCC = Op1; 17039 else // Quit if neither operand is a constant. 17040 return SDValue(); 17041 17042 if (C->getZExtValue() == 1) { 17043 needOppositeCond = !needOppositeCond; 17044 checkAgainstTrue = true; 17045 } else if (C->getZExtValue() != 0) 17046 // Quit if the constant is neither 0 nor 1. 17047 return SDValue(); 17048 17049 bool truncatedToBoolWithAnd = false; 17050 // Skip (zext $x), (trunc $x), or (and $x, 1) node. 17051 while (SetCC.getOpcode() == ISD::ZERO_EXTEND || 17052 SetCC.getOpcode() == ISD::TRUNCATE || 17053 SetCC.getOpcode() == ISD::AND) { 17054 if (SetCC.getOpcode() == ISD::AND) { 17055 int OpIdx = -1; 17056 ConstantSDNode *CS; 17057 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) && 17058 CS->getZExtValue() == 1) 17059 OpIdx = 1; 17060 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) && 17061 CS->getZExtValue() == 1) 17062 OpIdx = 0; 17063 if (OpIdx == -1) 17064 break; 17065 SetCC = SetCC.getOperand(OpIdx); 17066 truncatedToBoolWithAnd = true; 17067 } else 17068 SetCC = SetCC.getOperand(0); 17069 } 17070 17071 switch (SetCC.getOpcode()) { 17072 case X86ISD::SETCC_CARRY: 17073 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to 17074 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, 17075 // i.e. it's a comparison against true but the result of SETCC_CARRY is not 17076 // truncated to i1 using 'and'.
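    // e.g. SETCC_CARRY yields ~0 when CF is set, so (cmp setcc_carry, 1) can never be an equality with 1 and tells us nothing about CF unless the value was first reduced to a bool with (and ..., 1).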
17077 if (checkAgainstTrue && !truncatedToBoolWithAnd) 17078 break; 17079 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && 17080 "Invalid use of SETCC_CARRY!"); 17081 // FALL THROUGH 17082 case X86ISD::SETCC: 17083 // Set the condition code or opposite one if necessary. 17084 CC = X86::CondCode(SetCC.getConstantOperandVal(0)); 17085 if (needOppositeCond) 17086 CC = X86::GetOppositeBranchCondition(CC); 17087 return SetCC.getOperand(1); 17088 case X86ISD::CMOV: { 17089 // Check whether the false/true values are canonical, i.e. 0 or 1. 17090 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); 17091 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); 17092 // Quit if true value is not a constant. 17093 if (!TVal) 17094 return SDValue(); 17095 // Quit if false value is not a constant. 17096 if (!FVal) { 17097 SDValue Op = SetCC.getOperand(0); 17098 // Skip 'zext' or 'trunc' node. 17099 if (Op.getOpcode() == ISD::ZERO_EXTEND || 17100 Op.getOpcode() == ISD::TRUNCATE) 17101 Op = Op.getOperand(0); 17102 // A special case for rdrand/rdseed, where 0 is set if the false cond 17103 // is found. 17104 if ((Op.getOpcode() != X86ISD::RDRAND && 17105 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) 17106 return SDValue(); 17107 } 17108 // Quit if false value is not the constant 0 or 1. 17109 bool FValIsFalse = true; 17110 if (FVal && FVal->getZExtValue() != 0) { 17111 if (FVal->getZExtValue() != 1) 17112 return SDValue(); 17113 // If FVal is 1, opposite cond is needed. 17114 needOppositeCond = !needOppositeCond; 17115 FValIsFalse = false; 17116 } 17117 // Quit if TVal is not the constant opposite of FVal. 17118 if (FValIsFalse && TVal->getZExtValue() != 1) 17119 return SDValue(); 17120 if (!FValIsFalse && TVal->getZExtValue() != 0) 17121 return SDValue(); 17122 CC = X86::CondCode(SetCC.getConstantOperandVal(2)); 17123 if (needOppositeCond) 17124 CC = X86::GetOppositeBranchCondition(CC); 17125 return SetCC.getOperand(3); 17126 } 17127 } 17128 17129 return SDValue(); 17130} 17131 17132/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 17133static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 17134 TargetLowering::DAGCombinerInfo &DCI, 17135 const X86Subtarget *Subtarget) { 17136 SDLoc DL(N); 17137 17138 // If the flag operand isn't dead, don't touch this CMOV. 17139 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 17140 return SDValue(); 17141 17142 SDValue FalseOp = N->getOperand(0); 17143 SDValue TrueOp = N->getOperand(1); 17144 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 17145 SDValue Cond = N->getOperand(3); 17146 17147 if (CC == X86::COND_E || CC == X86::COND_NE) { 17148 switch (Cond.getOpcode()) { 17149 default: break; 17150 case X86ISD::BSR: 17151 case X86ISD::BSF: 17152 // If the operand of BSR / BSF is proven never zero, then ZF cannot be set. 17153 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 17154 return (CC == X86::COND_E) ? FalseOp : TrueOp; 17155 } 17156 } 17157 17158 SDValue Flags; 17159 17160 Flags = checkBoolTestSetCCCombine(Cond, CC); 17161 if (Flags.getNode() && 17162 // Extra check as FCMOV only supports a subset of X86 cond.
17163 (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { 17164 SDValue Ops[] = { FalseOp, TrueOp, 17165 DAG.getConstant(CC, MVT::i8), Flags }; 17166 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), 17167 Ops, array_lengthof(Ops)); 17168 } 17169 17170 // If this is a select between two integer constants, try to do some 17171 // optimizations. Note that the operands are ordered the opposite of SELECT 17172 // operands. 17173 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 17174 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 17175 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 17176 // larger than FalseC (the false value). 17177 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 17178 CC = X86::GetOppositeBranchCondition(CC); 17179 std::swap(TrueC, FalseC); 17180 std::swap(TrueOp, FalseOp); 17181 } 17182 17183 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 17184 // This is efficient for any integer data type (including i8/i16) and 17185 // shift amount. 17186 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 17187 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 17188 DAG.getConstant(CC, MVT::i8), Cond); 17189 17190 // Zero extend the condition if needed. 17191 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 17192 17193 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 17194 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 17195 DAG.getConstant(ShAmt, MVT::i8)); 17196 if (N->getNumValues() == 2) // Dead flag value? 17197 return DCI.CombineTo(N, Cond, SDValue()); 17198 return Cond; 17199 } 17200 17201 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient 17202 // for any integer data type, including i8/i16. 17203 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 17204 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 17205 DAG.getConstant(CC, MVT::i8), Cond); 17206 17207 // Zero extend the condition if needed. 17208 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 17209 FalseC->getValueType(0), Cond); 17210 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 17211 SDValue(FalseC, 0)); 17212 17213 if (N->getNumValues() == 2) // Dead flag value? 17214 return DCI.CombineTo(N, Cond, SDValue()); 17215 return Cond; 17216 } 17217 17218 // Optimize cases that will turn into an LEA instruction. This requires 17219 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 17220 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 17221 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 17222 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 17223 17224 bool isFastMultiplier = false; 17225 if (Diff < 10) { 17226 switch ((unsigned char)Diff) { 17227 default: break; 17228 case 1: // result = add base, cond 17229 case 2: // result = lea base( , cond*2) 17230 case 3: // result = lea base(cond, cond*2) 17231 case 4: // result = lea base( , cond*4) 17232 case 5: // result = lea base(cond, cond*4) 17233 case 8: // result = lea base( , cond*8) 17234 case 9: // result = lea base(cond, cond*8) 17235 isFastMultiplier = true; 17236 break; 17237 } 17238 } 17239 17240 if (isFastMultiplier) { 17241 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 17242 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 17243 DAG.getConstant(CC, MVT::i8), Cond); 17244 // Zero extend the condition if needed.
17245 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 17246 Cond); 17247 // Scale the condition by the difference. 17248 if (Diff != 1) 17249 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 17250 DAG.getConstant(Diff, Cond.getValueType())); 17251 17252 // Add the base if non-zero. 17253 if (FalseC->getAPIntValue() != 0) 17254 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 17255 SDValue(FalseC, 0)); 17256 if (N->getNumValues() == 2) // Dead flag value? 17257 return DCI.CombineTo(N, Cond, SDValue()); 17258 return Cond; 17259 } 17260 } 17261 } 17262 } 17263 17264 // Handle these cases: 17265 // (select (x != c), e, c) -> (select (x != c), e, x), 17266 // (select (x == c), c, e) -> (select (x == c), x, e) 17267 // where the c is an integer constant, and the "select" is the combination 17268 // of CMOV and CMP. 17269 // 17270 // The rationale for this change is that the conditional-move from a constant 17271 // needs two instructions; however, conditional-move from a register needs 17272 // only one instruction. 17273 // 17274 // CAVEAT: By replacing a constant with a symbolic value, it may obscure 17275 // some instruction-combining opportunities. This opt needs to be 17276 // postponed as late as possible. 17277 // 17278 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { 17279 // The DCI.xxxx conditions are provided to postpone the optimization as 17280 // late as possible. 17281 17282 ConstantSDNode *CmpAgainst = 0; 17283 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && 17284 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && 17285 !isa<ConstantSDNode>(Cond.getOperand(0))) { 17286 17287 if (CC == X86::COND_NE && 17288 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { 17289 CC = X86::GetOppositeBranchCondition(CC); 17290 std::swap(TrueOp, FalseOp); 17291 } 17292 17293 if (CC == X86::COND_E && 17294 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { 17295 SDValue Ops[] = { FalseOp, Cond.getOperand(0), 17296 DAG.getConstant(CC, MVT::i8), Cond }; 17297 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops, 17298 array_lengthof(Ops)); 17299 } 17300 } 17301 } 17302 17303 return SDValue(); 17304} 17305 17306/// PerformMulCombine - Optimize a single multiply with constant into two 17307/// in order to implement it with two cheaper instructions, e.g. 17308/// LEA + SHL, LEA + LEA.
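/// e.g. x*45 = (x*9)*5 becomes two LEAs, and x*40 = (x<<3)*5 becomes SHL + LEA; multiplies by a plain power of two or by 3, 5, or 9 are skipped, since isel already handles those directly.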
17309static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 17310 TargetLowering::DAGCombinerInfo &DCI) { 17311 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 17312 return SDValue(); 17313 17314 EVT VT = N->getValueType(0); 17315 if (VT != MVT::i64) 17316 return SDValue(); 17317 17318 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 17319 if (!C) 17320 return SDValue(); 17321 uint64_t MulAmt = C->getZExtValue(); 17322 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 17323 return SDValue(); 17324 17325 uint64_t MulAmt1 = 0; 17326 uint64_t MulAmt2 = 0; 17327 if ((MulAmt % 9) == 0) { 17328 MulAmt1 = 9; 17329 MulAmt2 = MulAmt / 9; 17330 } else if ((MulAmt % 5) == 0) { 17331 MulAmt1 = 5; 17332 MulAmt2 = MulAmt / 5; 17333 } else if ((MulAmt % 3) == 0) { 17334 MulAmt1 = 3; 17335 MulAmt2 = MulAmt / 3; 17336 } 17337 if (MulAmt2 && 17338 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 17339 SDLoc DL(N); 17340 17341 if (isPowerOf2_64(MulAmt2) && 17342 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 17343 // If the second multiplier is pow2, issue it first. We want the multiply 17344 // by 3, 5, or 9 to be folded into the addressing mode unless the lone use 17345 // is an add. 17346 std::swap(MulAmt1, MulAmt2); 17347 17348 SDValue NewMul; 17349 if (isPowerOf2_64(MulAmt1)) 17350 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 17351 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 17352 else 17353 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 17354 DAG.getConstant(MulAmt1, VT)); 17355 17356 if (isPowerOf2_64(MulAmt2)) 17357 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 17358 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 17359 else 17360 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 17361 DAG.getConstant(MulAmt2, VT)); 17362 17363 // Do not add new nodes to DAG combiner worklist. 17364 DCI.CombineTo(N, NewMul, false); 17365 } 17366 return SDValue(); 17367} 17368 17369static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 17370 SDValue N0 = N->getOperand(0); 17371 SDValue N1 = N->getOperand(1); 17372 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 17373 EVT VT = N0.getValueType(); 17374 17375 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 17376 // since the result of setcc_c is all zeros or all ones. 17377 if (VT.isInteger() && !VT.isVector() && 17378 N1C && N0.getOpcode() == ISD::AND && 17379 N0.getOperand(1).getOpcode() == ISD::Constant) { 17380 SDValue N00 = N0.getOperand(0); 17381 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 17382 ((N00.getOpcode() == ISD::ANY_EXTEND || 17383 N00.getOpcode() == ISD::ZERO_EXTEND) && 17384 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 17385 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 17386 APInt ShAmt = N1C->getAPIntValue(); 17387 Mask = Mask.shl(ShAmt); 17388 if (Mask != 0) 17389 return DAG.getNode(ISD::AND, SDLoc(N), VT, 17390 N00, DAG.getConstant(Mask, VT)); 17391 } 17392 } 17393 17394 // Hardware support for vector shifts is sparse which makes us scalarize the 17395 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than 17396 // shl. 17397 // (shl V, 1) -> add V,V 17398 if (isSplatVector(N1.getNode())) { 17399 assert(N0.getValueType().isVector() && "Invalid vector shift type"); 17400 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0)); 17401 // We shift all of the values by one.
In many cases we do not have 17402 // hardware support for this operation. This is better expressed as an ADD 17403 // of two values. 17404 if (N1C && (1 == N1C->getZExtValue())) { 17405 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); 17406 } 17407 } 17408 17409 return SDValue(); 17410} 17411 17412/// \brief Returns a vector of 0s if the node in input is a vector logical 17413/// shift by a constant amount which is known to be bigger than or equal 17414/// to the vector element size in bits. 17415static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, 17416 const X86Subtarget *Subtarget) { 17417 EVT VT = N->getValueType(0); 17418 17419 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && 17420 (!Subtarget->hasInt256() || 17421 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) 17422 return SDValue(); 17423 17424 SDValue Amt = N->getOperand(1); 17425 SDLoc DL(N); 17426 if (isSplatVector(Amt.getNode())) { 17427 SDValue SclrAmt = Amt->getOperand(0); 17428 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 17429 APInt ShiftAmt = C->getAPIntValue(); 17430 unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); 17431 17432 // SSE2/AVX2 logical shifts always return a vector of 0s 17433 // if the shift amount is bigger than or equal to 17434 // the element size. The constant shift amount will be 17435 // encoded as an 8-bit immediate. 17436 if (ShiftAmt.trunc(8).uge(MaxAmount)) 17437 return getZeroVector(VT, Subtarget, DAG, DL); 17438 } 17439 } 17440 17441 return SDValue(); 17442} 17443 17444/// PerformShiftCombine - Combine shifts. 17445static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 17446 TargetLowering::DAGCombinerInfo &DCI, 17447 const X86Subtarget *Subtarget) { 17448 if (N->getOpcode() == ISD::SHL) { 17449 SDValue V = PerformSHLCombine(N, DAG); 17450 if (V.getNode()) return V; 17451 } 17452 17453 if (N->getOpcode() != ISD::SRA) { 17454 // Try to fold this logical shift into a zero vector. 17455 SDValue V = performShiftToAllZeros(N, DAG, Subtarget); 17456 if (V.getNode()) return V; 17457 } 17458 17459 return SDValue(); 17460} 17461 17462// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 17463// where both setccs reference the same FP CMP, and rewrite for CMPEQSS 17464// and friends. Likewise for OR -> CMPNEQSS. 17465static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 17466 TargetLowering::DAGCombinerInfo &DCI, 17467 const X86Subtarget *Subtarget) { 17468 unsigned opcode; 17469 17470 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 17471 // we're requiring SSE2 for both. 17472 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 17473 SDValue N0 = N->getOperand(0); 17474 SDValue N1 = N->getOperand(1); 17475 SDValue CMP0 = N0->getOperand(1); 17476 SDValue CMP1 = N1->getOperand(1); 17477 SDLoc DL(N); 17478 17479 // The SETCCs should both refer to the same CMP.
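    // e.g. (and (setcc COND_E (cmp x, y)) (setcc COND_NP (cmp x, y))) is the ordered-equal idiom and collapses into a single CMPEQSS below.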
17480 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 17481 return SDValue(); 17482 17483 SDValue CMP00 = CMP0->getOperand(0); 17484 SDValue CMP01 = CMP0->getOperand(1); 17485 EVT VT = CMP00.getValueType(); 17486 17487 if (VT == MVT::f32 || VT == MVT::f64) { 17488 bool ExpectingFlags = false; 17489 // Check for any users that want flags: 17490 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); 17491 !ExpectingFlags && UI != UE; ++UI) 17492 switch (UI->getOpcode()) { 17493 default: 17494 case ISD::BR_CC: 17495 case ISD::BRCOND: 17496 case ISD::SELECT: 17497 ExpectingFlags = true; 17498 break; 17499 case ISD::CopyToReg: 17500 case ISD::SIGN_EXTEND: 17501 case ISD::ZERO_EXTEND: 17502 case ISD::ANY_EXTEND: 17503 break; 17504 } 17505 17506 if (!ExpectingFlags) { 17507 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 17508 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 17509 17510 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 17511 X86::CondCode tmp = cc0; 17512 cc0 = cc1; 17513 cc1 = tmp; 17514 } 17515 17516 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 17517 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 17518 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 17519 X86ISD::NodeType NTOperator = is64BitFP ? 17520 X86ISD::FSETCCsd : X86ISD::FSETCCss; 17521 // FIXME: need symbolic constants for these magic numbers. 17522 // See X86ATTInstPrinter.cpp:printSSECC(). 17523 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; 17524 SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, 17525 DAG.getConstant(x86cc, MVT::i8)); 17526 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, 17527 OnesOrZeroesF); 17528 SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, 17529 DAG.getConstant(1, MVT::i32)); 17530 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); 17531 return OneBitOfTruth; 17532 } 17533 } 17534 } 17535 } 17536 return SDValue(); 17537} 17538 17539/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector 17540/// so it can be folded inside ANDNP. 17541static bool CanFoldXORWithAllOnes(const SDNode *N) { 17542 EVT VT = N->getValueType(0); 17543 17544 // Match direct AllOnes for 128 and 256-bit vectors 17545 if (ISD::isBuildVectorAllOnes(N)) 17546 return true; 17547 17548 // Look through a bit convert. 17549 if (N->getOpcode() == ISD::BITCAST) 17550 N = N->getOperand(0).getNode(); 17551 17552 // Sometimes the operand may come from an insert_subvector building a 17553 // 256-bit all-ones vector 17554 if (VT.is256BitVector() && 17555 N->getOpcode() == ISD::INSERT_SUBVECTOR) { 17556 SDValue V1 = N->getOperand(0); 17557 SDValue V2 = N->getOperand(1); 17558 17559 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && 17560 V1.getOperand(0).getOpcode() == ISD::UNDEF && 17561 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && 17562 ISD::isBuildVectorAllOnes(V2.getNode())) 17563 return true; 17564 } 17565 17566 return false; 17567} 17568 17569// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized 17570// register. In most cases we actually compare or select YMM-sized registers 17571// and mixing the two types creates horrible code. This method optimizes 17572// some of the transition sequences.
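// e.g. (v8i32 zext (v8i16 and (trunc A), (trunc B))), with A and B of type v8i32, becomes (v8i32 and (and A, B), splat(0xFFFF)): the logic op is done once on the wide type instead of round-tripping both inputs through XMM.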
17573static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, 17574 TargetLowering::DAGCombinerInfo &DCI, 17575 const X86Subtarget *Subtarget) { 17576 EVT VT = N->getValueType(0); 17577 if (!VT.is256BitVector()) 17578 return SDValue(); 17579 17580 assert((N->getOpcode() == ISD::ANY_EXTEND || 17581 N->getOpcode() == ISD::ZERO_EXTEND || 17582 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); 17583 17584 SDValue Narrow = N->getOperand(0); 17585 EVT NarrowVT = Narrow->getValueType(0); 17586 if (!NarrowVT.is128BitVector()) 17587 return SDValue(); 17588 17589 if (Narrow->getOpcode() != ISD::XOR && 17590 Narrow->getOpcode() != ISD::AND && 17591 Narrow->getOpcode() != ISD::OR) 17592 return SDValue(); 17593 17594 SDValue N0 = Narrow->getOperand(0); 17595 SDValue N1 = Narrow->getOperand(1); 17596 SDLoc DL(Narrow); 17597 17598 // The Left side has to be a trunc. 17599 if (N0.getOpcode() != ISD::TRUNCATE) 17600 return SDValue(); 17601 17602 // The type of the truncated inputs. 17603 EVT WideVT = N0->getOperand(0)->getValueType(0); 17604 if (WideVT != VT) 17605 return SDValue(); 17606 17607 // The right side has to be a 'trunc' or a constant vector. 17608 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; 17609 bool RHSConst = (isSplatVector(N1.getNode()) && 17610 isa<ConstantSDNode>(N1->getOperand(0))); 17611 if (!RHSTrunc && !RHSConst) 17612 return SDValue(); 17613 17614 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17615 17616 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) 17617 return SDValue(); 17618 17619 // Set N0 and N1 to hold the inputs to the new wide operation. 17620 N0 = N0->getOperand(0); 17621 if (RHSConst) { 17622 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), 17623 N1->getOperand(0)); 17624 SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); 17625 N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size()); 17626 } else if (RHSTrunc) { 17627 N1 = N1->getOperand(0); 17628 } 17629 17630 // Generate the wide operation. 
17631 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); 17632 unsigned Opcode = N->getOpcode(); 17633 switch (Opcode) { 17634 case ISD::ANY_EXTEND: 17635 return Op; 17636 case ISD::ZERO_EXTEND: { 17637 unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); 17638 APInt Mask = APInt::getAllOnesValue(InBits); 17639 Mask = Mask.zext(VT.getScalarType().getSizeInBits()); 17640 return DAG.getNode(ISD::AND, DL, VT, 17641 Op, DAG.getConstant(Mask, VT)); 17642 } 17643 case ISD::SIGN_EXTEND: 17644 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, 17645 Op, DAG.getValueType(NarrowVT)); 17646 default: 17647 llvm_unreachable("Unexpected opcode"); 17648 } 17649} 17650 17651static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 17652 TargetLowering::DAGCombinerInfo &DCI, 17653 const X86Subtarget *Subtarget) { 17654 EVT VT = N->getValueType(0); 17655 if (DCI.isBeforeLegalizeOps()) 17656 return SDValue(); 17657 17658 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 17659 if (R.getNode()) 17660 return R; 17661 17662 // Create BLSI, BLSR, BZHI, and BEXTR instructions 17663 // BLSI is X & (-X) 17664 // BLSR is X & (X-1) 17665 // BZHI is X & ((1 << Y) - 1) 17666 // BEXTR is ((X >> imm) & (2**size-1)) 17667 if (VT == MVT::i32 || VT == MVT::i64) { 17668 SDValue N0 = N->getOperand(0); 17669 SDValue N1 = N->getOperand(1); 17670 SDLoc DL(N); 17671 17672 if (Subtarget->hasBMI()) { 17673 // Check LHS for neg 17674 if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && 17675 isZero(N0.getOperand(0))) 17676 return DAG.getNode(X86ISD::BLSI, DL, VT, N1); 17677 17678 // Check RHS for neg 17679 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && 17680 isZero(N1.getOperand(0))) 17681 return DAG.getNode(X86ISD::BLSI, DL, VT, N0); 17682 17683 // Check LHS for X-1 17684 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 17685 isAllOnes(N0.getOperand(1))) 17686 return DAG.getNode(X86ISD::BLSR, DL, VT, N1); 17687 17688 // Check RHS for X-1 17689 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 17690 isAllOnes(N1.getOperand(1))) 17691 return DAG.getNode(X86ISD::BLSR, DL, VT, N0); 17692 } 17693 17694 if (Subtarget->hasBMI2()) { 17695 // Check for (and (add (shl 1, Y), -1), X) 17696 if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) { 17697 SDValue N00 = N0.getOperand(0); 17698 if (N00.getOpcode() == ISD::SHL) { 17699 SDValue N001 = N00.getOperand(1); 17700 assert(N001.getValueType() == MVT::i8 && "unexpected type"); 17701 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0)); 17702 if (C && C->getZExtValue() == 1) 17703 return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001); 17704 } 17705 } 17706 17707 // Check for (and X, (add (shl 1, Y), -1)) 17708 if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) { 17709 SDValue N10 = N1.getOperand(0); 17710 if (N10.getOpcode() == ISD::SHL) { 17711 SDValue N101 = N10.getOperand(1); 17712 assert(N101.getValueType() == MVT::i8 && "unexpected type"); 17713 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0)); 17714 if (C && C->getZExtValue() == 1) 17715 return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101); 17716 } 17717 } 17718 } 17719 17720 // Check for BEXTR.
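    // BEXTR's control operand packs the bit offset in bits 7:0 and the field length in bits 15:8, hence the (Shift | (MaskSize << 8)) constant below; e.g. (and (srl x, 4), 0xFFF) becomes bextr x, ((12 << 8) | 4).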
17721 if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && 17722 (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { 17723 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); 17724 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 17725 if (MaskNode && ShiftNode) { 17726 uint64_t Mask = MaskNode->getZExtValue(); 17727 uint64_t Shift = ShiftNode->getZExtValue(); 17728 if (isMask_64(Mask)) { 17729 uint64_t MaskSize = CountPopulation_64(Mask); 17730 if (Shift + MaskSize <= VT.getSizeInBits()) 17731 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), 17732 DAG.getConstant(Shift | (MaskSize << 8), VT)); 17733 } 17734 } 17735 } // BEXTR 17736 17737 return SDValue(); 17738 } 17739 17740 // Want to form ANDNP nodes: 17741 // 1) In the hopes of then easily combining them with OR and AND nodes 17742 // to form PBLEND/PSIGN. 17743 // 2) To match ANDN packed intrinsics 17744 if (VT != MVT::v2i64 && VT != MVT::v4i64) 17745 return SDValue(); 17746 17747 SDValue N0 = N->getOperand(0); 17748 SDValue N1 = N->getOperand(1); 17749 SDLoc DL(N); 17750 17751 // Check LHS for vnot 17752 if (N0.getOpcode() == ISD::XOR && 17753 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 17754 CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) 17755 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); 17756 17757 // Check RHS for vnot 17758 if (N1.getOpcode() == ISD::XOR && 17759 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 17760 CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) 17761 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); 17762 17763 return SDValue(); 17764} 17765 17766static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 17767 TargetLowering::DAGCombinerInfo &DCI, 17768 const X86Subtarget *Subtarget) { 17769 EVT VT = N->getValueType(0); 17770 if (DCI.isBeforeLegalizeOps()) 17771 return SDValue(); 17772 17773 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 17774 if (R.getNode()) 17775 return R; 17776 17777 SDValue N0 = N->getOperand(0); 17778 SDValue N1 = N->getOperand(1); 17779 17780 // Look for psign/blend 17781 if (VT == MVT::v2i64 || VT == MVT::v4i64) { 17782 if (!Subtarget->hasSSSE3() || 17783 (VT == MVT::v4i64 && !Subtarget->hasInt256())) 17784 return SDValue(); 17785 17786 // Canonicalize pandn to RHS 17787 if (N0.getOpcode() == X86ISD::ANDNP) 17788 std::swap(N0, N1); 17789 // or (and (m, y), (pandn m, x)) 17790 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { 17791 SDValue Mask = N1.getOperand(0); 17792 SDValue X = N1.getOperand(1); 17793 SDValue Y; 17794 if (N0.getOperand(0) == Mask) 17795 Y = N0.getOperand(1); 17796 if (N0.getOperand(1) == Mask) 17797 Y = N0.getOperand(0); 17798 17799 // Check to see if the mask appeared in both the AND and the ANDNP. 17800 if (!Y.getNode()) 17801 return SDValue(); 17802 17803 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 17804 // Look through mask bitcast. 17805 if (Mask.getOpcode() == ISD::BITCAST) 17806 Mask = Mask.getOperand(0); 17807 if (X.getOpcode() == ISD::BITCAST) 17808 X = X.getOperand(0); 17809 if (Y.getOpcode() == ISD::BITCAST) 17810 Y = Y.getOperand(0); 17811 17812 EVT MaskVT = Mask.getValueType(); 17813 17814 // Validate that the Mask operand is a vector sra node.
17815 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 17816 // there is no psrai.b 17817 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 17818 unsigned SraAmt = ~0; 17819 if (Mask.getOpcode() == ISD::SRA) { 17820 SDValue Amt = Mask.getOperand(1); 17821 if (isSplatVector(Amt.getNode())) { 17822 SDValue SclrAmt = Amt->getOperand(0); 17823 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) 17824 SraAmt = C->getZExtValue(); 17825 } 17826 } else if (Mask.getOpcode() == X86ISD::VSRAI) { 17827 SDValue SraC = Mask.getOperand(1); 17828 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 17829 } 17830 if ((SraAmt + 1) != EltBits) 17831 return SDValue(); 17832 17833 SDLoc DL(N); 17834 17835 // Now we know we at least have a pblendvb with the mask val. See if 17836 // we can form a psignb/w/d. 17837 // psign = x.type == y.type == mask.type && y = sub(0, x); 17838 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 17839 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 17840 X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { 17841 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && 17842 "Unsupported VT for PSIGN"); 17843 Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); 17844 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 17845 } 17846 // PBLENDVB is only available on SSE 4.1 17847 if (!Subtarget->hasSSE41()) 17848 return SDValue(); 17849 17850 EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; 17851 17852 X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); 17853 Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); 17854 Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); 17855 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); 17856 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 17857 } 17858 } 17859 17860 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 17861 return SDValue(); 17862 17863 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 17864 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 17865 std::swap(N0, N1); 17866 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 17867 return SDValue(); 17868 if (!N0.hasOneUse() || !N1.hasOneUse()) 17869 return SDValue(); 17870 17871 SDValue ShAmt0 = N0.getOperand(1); 17872 if (ShAmt0.getValueType() != MVT::i8) 17873 return SDValue(); 17874 SDValue ShAmt1 = N1.getOperand(1); 17875 if (ShAmt1.getValueType() != MVT::i8) 17876 return SDValue(); 17877 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 17878 ShAmt0 = ShAmt0.getOperand(0); 17879 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 17880 ShAmt1 = ShAmt1.getOperand(0); 17881 17882 SDLoc DL(N); 17883 unsigned Opc = X86ISD::SHLD; 17884 SDValue Op0 = N0.getOperand(0); 17885 SDValue Op1 = N1.getOperand(0); 17886 if (ShAmt0.getOpcode() == ISD::SUB) { 17887 Opc = X86ISD::SHRD; 17888 std::swap(Op0, Op1); 17889 std::swap(ShAmt0, ShAmt1); 17890 } 17891 17892 unsigned Bits = VT.getSizeInBits(); 17893 if (ShAmt1.getOpcode() == ISD::SUB) { 17894 SDValue Sum = ShAmt1.getOperand(0); 17895 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 17896 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 17897 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 17898 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 17899 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 17900 return DAG.getNode(Opc, DL, VT, 17901 Op0, Op1, 17902 DAG.getNode(ISD::TRUNCATE, DL, 17903 MVT::i8, ShAmt0)); 17904 } 17905 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 17906 ConstantSDNode *ShAmt0C =
dyn_cast<ConstantSDNode>(ShAmt0); 17907 if (ShAmt0C && 17908 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 17909 return DAG.getNode(Opc, DL, VT, 17910 N0.getOperand(0), N1.getOperand(0), 17911 DAG.getNode(ISD::TRUNCATE, DL, 17912 MVT::i8, ShAmt0)); 17913 } 17914 17915 return SDValue(); 17916} 17917 17918// Generate NEG and CMOV for integer abs. 17919static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { 17920 EVT VT = N->getValueType(0); 17921 17922 // Since X86 does not have CMOV for 8-bit integer, we don't convert 17923 // 8-bit integer abs to NEG and CMOV. 17924 if (VT.isInteger() && VT.getSizeInBits() == 8) 17925 return SDValue(); 17926 17927 SDValue N0 = N->getOperand(0); 17928 SDValue N1 = N->getOperand(1); 17929 SDLoc DL(N); 17930 17931 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 17932 // and change it to SUB and CMOV. 17933 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 17934 N0.getOpcode() == ISD::ADD && 17935 N0.getOperand(1) == N1 && 17936 N1.getOpcode() == ISD::SRA && 17937 N1.getOperand(0) == N0.getOperand(0)) 17938 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 17939 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { 17940 // Generate SUB & CMOV. 17941 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), 17942 DAG.getConstant(0, VT), N0.getOperand(0)); 17943 17944 SDValue Ops[] = { N0.getOperand(0), Neg, 17945 DAG.getConstant(X86::COND_GE, MVT::i8), 17946 SDValue(Neg.getNode(), 1) }; 17947 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), 17948 Ops, array_lengthof(Ops)); 17949 } 17950 return SDValue(); 17951} 17952 17953// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes 17954static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, 17955 TargetLowering::DAGCombinerInfo &DCI, 17956 const X86Subtarget *Subtarget) { 17957 EVT VT = N->getValueType(0); 17958 if (DCI.isBeforeLegalizeOps()) 17959 return SDValue(); 17960 17961 if (Subtarget->hasCMov()) { 17962 SDValue RV = performIntegerAbsCombine(N, DAG); 17963 if (RV.getNode()) 17964 return RV; 17965 } 17966 17967 // Try forming BMI if it is available. 17968 if (!Subtarget->hasBMI()) 17969 return SDValue(); 17970 17971 if (VT != MVT::i32 && VT != MVT::i64) 17972 return SDValue(); 17973 17974 assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions"); 17975 17976 // Create BLSMSK instructions by finding X ^ (X-1) 17977 SDValue N0 = N->getOperand(0); 17978 SDValue N1 = N->getOperand(1); 17979 SDLoc DL(N); 17980 17981 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 17982 isAllOnes(N0.getOperand(1))) 17983 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); 17984 17985 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 17986 isAllOnes(N1.getOperand(1))) 17987 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); 17988 17989 return SDValue(); 17990} 17991 17992/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. 17993static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, 17994 TargetLowering::DAGCombinerInfo &DCI, 17995 const X86Subtarget *Subtarget) { 17996 LoadSDNode *Ld = cast<LoadSDNode>(N); 17997 EVT RegVT = Ld->getValueType(0); 17998 EVT MemVT = Ld->getMemoryVT(); 17999 SDLoc dl(Ld); 18000 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18001 unsigned RegSz = RegVT.getSizeInBits(); 18002 18003 // On Sandybridge unaligned 256bit loads are inefficient. 
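  // Splitting such a load into two 16-byte loads and recombining them with VINSERTF128 is cheaper there, which is what the code below does for misaligned, non-extending 256-bit loads when Int256 (AVX2) is absent.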
18004 ISD::LoadExtType Ext = Ld->getExtensionType(); 18005 unsigned Alignment = Ld->getAlignment(); 18006 bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; 18007 if (RegVT.is256BitVector() && !Subtarget->hasInt256() && 18008 !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { 18009 unsigned NumElems = RegVT.getVectorNumElements(); 18010 if (NumElems < 2) 18011 return SDValue(); 18012 18013 SDValue Ptr = Ld->getBasePtr(); 18014 SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); 18015 18016 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 18017 NumElems/2); 18018 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, 18019 Ld->getPointerInfo(), Ld->isVolatile(), 18020 Ld->isNonTemporal(), Ld->isInvariant(), 18021 Alignment); 18022 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 18023 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, 18024 Ld->getPointerInfo(), Ld->isVolatile(), 18025 Ld->isNonTemporal(), Ld->isInvariant(), 18026 std::min(16U, Alignment)); 18027 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 18028 Load1.getValue(1), 18029 Load2.getValue(1)); 18030 18031 SDValue NewVec = DAG.getUNDEF(RegVT); 18032 NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); 18033 NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); 18034 return DCI.CombineTo(N, NewVec, TF, true); 18035 } 18036 18037 // If this is a vector EXT Load then attempt to optimize it using a 18038 // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the 18039 // expansion is still better than scalar code. 18040 // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll 18041 // emit a shuffle and an arithmetic shift. 18042 // TODO: It is possible to support ZExt by zeroing the undef values 18043 // during the shuffle phase or after the shuffle. 18044 if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() && 18045 (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) { 18046 assert(MemVT != RegVT && "Cannot extend to the same type"); 18047 assert(MemVT.isVector() && "Must load a vector from memory"); 18048 18049 unsigned NumElems = RegVT.getVectorNumElements(); 18050 unsigned MemSz = MemVT.getSizeInBits(); 18051 assert(RegSz > MemSz && "Register size must be greater than the mem size"); 18052 18053 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) 18054 return SDValue(); 18055 18056 // All sizes must be a power of two. 18057 if (!isPowerOf2_32(RegSz * MemSz * NumElems)) 18058 return SDValue(); 18059 18060 // Attempt to load the original value using scalar loads. 18061 // Find the largest scalar type that divides the total loaded size. 18062 MVT SclrLoadTy = MVT::i8; 18063 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 18064 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 18065 MVT Tp = (MVT::SimpleValueType)tp; 18066 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { 18067 SclrLoadTy = Tp; 18068 } 18069 } 18070 18071 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to F64. 18072 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && 18073 (64 <= MemSz)) 18074 SclrLoadTy = MVT::f64; 18075 18076 // Calculate the number of scalar loads that we need to perform 18077 // in order to load our vector from memory.
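    // e.g. a sextload of v4i8 has MemSz = 32, so with a legal i32 scalar type a single 32-bit load suffices: NumLoads = 32 / 32 = 1.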
18078 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); 18079 if (Ext == ISD::SEXTLOAD && NumLoads > 1) 18080 return SDValue(); 18081 18082 unsigned loadRegSize = RegSz; 18083 if (Ext == ISD::SEXTLOAD && RegSz == 256) 18084 loadRegSize /= 2; 18085 18086 // Represent our vector as a sequence of elements which are the 18087 // largest scalar that we can load. 18088 EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, 18089 loadRegSize/SclrLoadTy.getSizeInBits()); 18090 18091 // Represent the data using the same element type that is stored in 18092 // memory. In practice, we "widen" MemVT. 18093 EVT WideVecVT = 18094 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 18095 loadRegSize/MemVT.getScalarType().getSizeInBits()); 18096 18097 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && 18098 "Invalid vector type"); 18099 18100 // We can't shuffle using an illegal type. 18101 if (!TLI.isTypeLegal(WideVecVT)) 18102 return SDValue(); 18103 18104 SmallVector<SDValue, 8> Chains; 18105 SDValue Ptr = Ld->getBasePtr(); 18106 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8, 18107 TLI.getPointerTy()); 18108 SDValue Res = DAG.getUNDEF(LoadUnitVecVT); 18109 18110 for (unsigned i = 0; i < NumLoads; ++i) { 18111 // Perform a single load. 18112 SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), 18113 Ptr, Ld->getPointerInfo(), 18114 Ld->isVolatile(), Ld->isNonTemporal(), 18115 Ld->isInvariant(), Ld->getAlignment()); 18116 Chains.push_back(ScalarLoad.getValue(1)); 18117 // Create the first element type using SCALAR_TO_VECTOR in order to avoid 18118 // another round of DAGCombining. 18119 if (i == 0) 18120 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); 18121 else 18122 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, 18123 ScalarLoad, DAG.getIntPtrConstant(i)); 18124 18125 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 18126 } 18127 18128 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 18129 Chains.size()); 18130 18131 // Bitcast the loaded value to a vector of the original element type, in 18132 // the size of the target vector type. 18133 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); 18134 unsigned SizeRatio = RegSz/MemSz; 18135 18136 if (Ext == ISD::SEXTLOAD) { 18137 // If we have SSE4.1 we can directly emit a VSEXT node. 18138 if (Subtarget->hasSSE41()) { 18139 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); 18140 return DCI.CombineTo(N, Sext, TF, true); 18141 } 18142 18143 // Otherwise we'll shuffle the small elements in the high bits of the 18144 // larger type and perform an arithmetic shift. If the shift is not legal 18145 // it's better to scalarize. 18146 if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT)) 18147 return SDValue(); 18148 18149 // Redistribute the loaded elements into the different locations. 18150 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 18151 for (unsigned i = 0; i != NumElems; ++i) 18152 ShuffleVec[i*SizeRatio + SizeRatio-1] = i; 18153 18154 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 18155 DAG.getUNDEF(WideVecVT), 18156 &ShuffleVec[0]); 18157 18158 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 18159 18160 // Build the arithmetic shift.
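    // The shift amount is the number of extension bits, e.g. 32 - 16 = 16 for v4i16 -> v4i32: the shuffle above placed each narrow element in the high part of its wide lane, so one SRA both repositions the value and sign-extends it.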
18161 unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - 18162 MemVT.getVectorElementType().getSizeInBits(); 18163 Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, 18164 DAG.getConstant(Amt, RegVT)); 18165 18166 return DCI.CombineTo(N, Shuff, TF, true); 18167 } 18168 18169 // Redistribute the loaded elements into the different locations. 18170 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 18171 for (unsigned i = 0; i != NumElems; ++i) 18172 ShuffleVec[i*SizeRatio] = i; 18173 18174 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 18175 DAG.getUNDEF(WideVecVT), 18176 &ShuffleVec[0]); 18177 18178 // Bitcast to the requested type. 18179 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 18180 // Replace the original load with the new sequence 18181 // and return the new chain. 18182 return DCI.CombineTo(N, Shuff, TF, true); 18183 } 18184 18185 return SDValue(); 18186} 18187 18188/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 18189static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 18190 const X86Subtarget *Subtarget) { 18191 StoreSDNode *St = cast<StoreSDNode>(N); 18192 EVT VT = St->getValue().getValueType(); 18193 EVT StVT = St->getMemoryVT(); 18194 SDLoc dl(St); 18195 SDValue StoredVal = St->getOperand(1); 18196 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18197 18198 // If we are saving a concatenation of two XMM registers, perform two stores. 18199 // On Sandy Bridge, 256-bit memory operations are executed by two 18200 // 128-bit ports. However, on Haswell it is better to issue a single 256-bit 18201 // memory operation. 18202 unsigned Alignment = St->getAlignment(); 18203 bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; 18204 if (VT.is256BitVector() && !Subtarget->hasInt256() && 18205 StVT == VT && !IsAligned) { 18206 unsigned NumElems = VT.getVectorNumElements(); 18207 if (NumElems < 2) 18208 return SDValue(); 18209 18210 SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); 18211 SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); 18212 18213 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); 18214 SDValue Ptr0 = St->getBasePtr(); 18215 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); 18216 18217 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 18218 St->getPointerInfo(), St->isVolatile(), 18219 St->isNonTemporal(), Alignment); 18220 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 18221 St->getPointerInfo(), St->isVolatile(), 18222 St->isNonTemporal(), 18223 std::min(16U, Alignment)); 18224 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 18225 } 18226 18227 // Optimize trunc store (of multiple scalars) to shuffle and store. 18228 // First, pack all of the elements in one place. Next, store to memory 18229 // in fewer chunks. 18230 if (St->isTruncatingStore() && VT.isVector()) { 18231 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 18232 unsigned NumElems = VT.getVectorNumElements(); 18233 assert(StVT != VT && "Cannot truncate to the same type"); 18234 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 18235 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 18236 18237 // From, To sizes and ElemCount must be pow of two 18238 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 18239 // We are going to use the original vector elt for storing. 18240 // Accumulated smaller vector elements must be a multiple of the store size. 
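    // E.g. (illustrative) a truncstore of v8i32 to v8i16 has FromSz = 32 and
    // ToSz = 16: NumElems * FromSz = 256 is a multiple of ToSz,
    // SizeRatio = 2, and the shuffle below packs the eight live i16
    // elements into the bottom half of a v16i16.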
18241 if (0 != (NumElems * FromSz) % ToSz) return SDValue(); 18242 18243 unsigned SizeRatio = FromSz / ToSz; 18244 18245 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 18246 18247 // Create a type on which we perform the shuffle 18248 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 18249 StVT.getScalarType(), NumElems*SizeRatio); 18250 18251 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 18252 18253 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); 18254 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 18255 for (unsigned i = 0; i != NumElems; ++i) 18256 ShuffleVec[i] = i * SizeRatio; 18257 18258 // Can't shuffle using an illegal type. 18259 if (!TLI.isTypeLegal(WideVecVT)) 18260 return SDValue(); 18261 18262 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 18263 DAG.getUNDEF(WideVecVT), 18264 &ShuffleVec[0]); 18265 // At this point all of the data is stored at the bottom of the 18266 // register. We now need to save it to mem. 18267 18268 // Find the largest store unit 18269 MVT StoreType = MVT::i8; 18270 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 18271 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 18272 MVT Tp = (MVT::SimpleValueType)tp; 18273 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) 18274 StoreType = Tp; 18275 } 18276 18277 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 18278 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && 18279 (64 <= NumElems * ToSz)) 18280 StoreType = MVT::f64; 18281 18282 // Bitcast the original vector into a vector of store-size units 18283 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 18284 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); 18285 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 18286 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); 18287 SmallVector<SDValue, 8> Chains; 18288 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, 18289 TLI.getPointerTy()); 18290 SDValue Ptr = St->getBasePtr(); 18291 18292 // Perform one or more big stores into memory. 18293 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { 18294 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 18295 StoreType, ShuffWide, 18296 DAG.getIntPtrConstant(i)); 18297 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 18298 St->getPointerInfo(), St->isVolatile(), 18299 St->isNonTemporal(), St->getAlignment()); 18300 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 18301 Chains.push_back(Ch); 18302 } 18303 18304 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 18305 Chains.size()); 18306 } 18307 18308 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 18309 // the FP state in cases where an emms may be missing. 18310 // A preferable solution to the general problem is to figure out the right 18311 // places to insert EMMS. This qualifies as a quick hack. 18312 18313 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 18314 if (VT.getSizeInBits() != 64) 18315 return SDValue(); 18316 18317 const Function *F = DAG.getMachineFunction().getFunction(); 18318 bool NoImplicitFloatOps = F->getAttributes(). 
18319 hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); 18320 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps 18321 && Subtarget->hasSSE2(); 18322 if ((VT.isVector() || 18323 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 18324 isa<LoadSDNode>(St->getValue()) && 18325 !cast<LoadSDNode>(St->getValue())->isVolatile() && 18326 St->getChain().hasOneUse() && !St->isVolatile()) { 18327 SDNode* LdVal = St->getValue().getNode(); 18328 LoadSDNode *Ld = 0; 18329 int TokenFactorIndex = -1; 18330 SmallVector<SDValue, 8> Ops; 18331 SDNode* ChainVal = St->getChain().getNode(); 18332 // Must be a store of a load. We currently handle two cases: the load 18333 // is a direct child, and it's under an intervening TokenFactor. It is 18334 // possible to dig deeper under nested TokenFactors. 18335 if (ChainVal == LdVal) 18336 Ld = cast<LoadSDNode>(St->getChain()); 18337 else if (St->getValue().hasOneUse() && 18338 ChainVal->getOpcode() == ISD::TokenFactor) { 18339 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { 18340 if (ChainVal->getOperand(i).getNode() == LdVal) { 18341 TokenFactorIndex = i; 18342 Ld = cast<LoadSDNode>(St->getValue()); 18343 } else 18344 Ops.push_back(ChainVal->getOperand(i)); 18345 } 18346 } 18347 18348 if (!Ld || !ISD::isNormalLoad(Ld)) 18349 return SDValue(); 18350 18351 // If this is not the MMX case, i.e. we are just turning i64 load/store 18352 // into f64 load/store, avoid the transformation if there are multiple 18353 // uses of the loaded value. 18354 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 18355 return SDValue(); 18356 18357 SDLoc LdDL(Ld); 18358 SDLoc StDL(N); 18359 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 18360 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 18361 // pair instead. 18362 if (Subtarget->is64Bit() || F64IsLegal) { 18363 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 18364 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 18365 Ld->getPointerInfo(), Ld->isVolatile(), 18366 Ld->isNonTemporal(), Ld->isInvariant(), 18367 Ld->getAlignment()); 18368 SDValue NewChain = NewLd.getValue(1); 18369 if (TokenFactorIndex != -1) { 18370 Ops.push_back(NewChain); 18371 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 18372 Ops.size()); 18373 } 18374 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 18375 St->getPointerInfo(), 18376 St->isVolatile(), St->isNonTemporal(), 18377 St->getAlignment()); 18378 } 18379 18380 // Otherwise, lower to two pairs of 32-bit loads / stores. 
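    // Sketch of the resulting DAG (operands elided): an i64 load feeding an
    // i64 store becomes
    //   lo = load i32 [ptr]        hi = load i32 [ptr+4]
    //   store i32 lo, [dst]        store i32 hi, [dst+4]
    // where the +4 halves use getWithOffset(4) and MinAlign(Align, 4).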
18381 SDValue LoAddr = Ld->getBasePtr(); 18382 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 18383 DAG.getConstant(4, MVT::i32)); 18384 18385 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 18386 Ld->getPointerInfo(), 18387 Ld->isVolatile(), Ld->isNonTemporal(), 18388 Ld->isInvariant(), Ld->getAlignment()); 18389 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 18390 Ld->getPointerInfo().getWithOffset(4), 18391 Ld->isVolatile(), Ld->isNonTemporal(), 18392 Ld->isInvariant(), 18393 MinAlign(Ld->getAlignment(), 4)); 18394 18395 SDValue NewChain = LoLd.getValue(1); 18396 if (TokenFactorIndex != -1) { 18397 Ops.push_back(LoLd); 18398 Ops.push_back(HiLd); 18399 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 18400 Ops.size()); 18401 } 18402 18403 LoAddr = St->getBasePtr(); 18404 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 18405 DAG.getConstant(4, MVT::i32)); 18406 18407 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 18408 St->getPointerInfo(), 18409 St->isVolatile(), St->isNonTemporal(), 18410 St->getAlignment()); 18411 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 18412 St->getPointerInfo().getWithOffset(4), 18413 St->isVolatile(), 18414 St->isNonTemporal(), 18415 MinAlign(St->getAlignment(), 4)); 18416 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 18417 } 18418 return SDValue(); 18419} 18420 18421/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" 18422/// and return the operands for the horizontal operation in LHS and RHS. A 18423/// horizontal operation performs the binary operation on successive elements 18424/// of its first operand, then on successive elements of its second operand, 18425/// returning the resulting values in a vector. For example, if 18426/// A = < float a0, float a1, float a2, float a3 > 18427/// and 18428/// B = < float b0, float b1, float b2, float b3 > 18429/// then the result of doing a horizontal operation on A and B is 18430/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 18431/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form 18432/// A horizontal-op B, for some already available A and B, and if so then LHS is 18433/// set to A, RHS to B, and the routine returns 'true'. 18434/// Note that the binary operation should have the property that if one of the 18435/// operands is UNDEF then the result is UNDEF. 18436static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { 18437 // Look for the following pattern: if 18438 // A = < float a0, float a1, float a2, float a3 > 18439 // B = < float b0, float b1, float b2, float b3 > 18440 // and 18441 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> 18442 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> 18443 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > 18444 // which is A horizontal-op B. 18445 18446 // At least one of the operands should be a vector shuffle. 18447 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && 18448 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) 18449 return false; 18450 18451 MVT VT = LHS.getSimpleValueType(); 18452 18453 assert((VT.is128BitVector() || VT.is256BitVector()) && 18454 "Unsupported vector type for horizontal add/sub"); 18455 18456 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to 18457 // operate independently on 128-bit lanes. 
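  // For example, for v8f32 on AVX: NumElts = 8, NumLanes = 2,
  // NumLaneElts = 4 and HalfLaneElts = 2, so within each 128-bit lane the
  // first two results come from LHS and the last two from RHS.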
18458 unsigned NumElts = VT.getVectorNumElements(); 18459 unsigned NumLanes = VT.getSizeInBits()/128; 18460 unsigned NumLaneElts = NumElts / NumLanes; 18461 assert((NumLaneElts % 2 == 0) && 18462 "Vector type should have an even number of elements in each lane"); 18463 unsigned HalfLaneElts = NumLaneElts/2; 18464 18465 // View LHS in the form 18466 // LHS = VECTOR_SHUFFLE A, B, LMask 18467 // If LHS is not a shuffle then pretend it is the shuffle 18468 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> 18469 // NOTE: in what follows a default initialized SDValue represents an UNDEF of 18470 // type VT. 18471 SDValue A, B; 18472 SmallVector<int, 16> LMask(NumElts); 18473 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 18474 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) 18475 A = LHS.getOperand(0); 18476 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) 18477 B = LHS.getOperand(1); 18478 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); 18479 std::copy(Mask.begin(), Mask.end(), LMask.begin()); 18480 } else { 18481 if (LHS.getOpcode() != ISD::UNDEF) 18482 A = LHS; 18483 for (unsigned i = 0; i != NumElts; ++i) 18484 LMask[i] = i; 18485 } 18486 18487 // Likewise, view RHS in the form 18488 // RHS = VECTOR_SHUFFLE C, D, RMask 18489 SDValue C, D; 18490 SmallVector<int, 16> RMask(NumElts); 18491 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 18492 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) 18493 C = RHS.getOperand(0); 18494 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) 18495 D = RHS.getOperand(1); 18496 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); 18497 std::copy(Mask.begin(), Mask.end(), RMask.begin()); 18498 } else { 18499 if (RHS.getOpcode() != ISD::UNDEF) 18500 C = RHS; 18501 for (unsigned i = 0; i != NumElts; ++i) 18502 RMask[i] = i; 18503 } 18504 18505 // Check that the shuffles are both shuffling the same vectors. 18506 if (!(A == C && B == D) && !(A == D && B == C)) 18507 return false; 18508 18509 // If everything is UNDEF then bail out: it would be better to fold to UNDEF. 18510 if (!A.getNode() && !B.getNode()) 18511 return false; 18512 18513 // If A and B occur in reverse order in RHS, then "swap" them (which means 18514 // rewriting the mask). 18515 if (A != C) 18516 CommuteVectorShuffleMask(RMask, NumElts); 18517 18518 // At this point LHS and RHS are equivalent to 18519 // LHS = VECTOR_SHUFFLE A, B, LMask 18520 // RHS = VECTOR_SHUFFLE A, B, RMask 18521 // Check that the masks correspond to performing a horizontal operation. 18522 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 18523 for (unsigned i = 0; i != NumLaneElts; ++i) { 18524 int LIdx = LMask[i+l], RIdx = RMask[i+l]; 18525 18526 // Ignore any UNDEF components. 18527 if (LIdx < 0 || RIdx < 0 || 18528 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || 18529 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) 18530 continue; 18531 18532 // Check that successive elements are being operated on. If not, this is 18533 // not a horizontal operation. 18534 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs 18535 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; 18536 if (!(LIdx == Index && RIdx == Index + 1) && 18537 !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) 18538 return false; 18539 } 18540 } 18541 18542 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 18543 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 
18544 return true; 18545} 18546 18547/// PerformFADDCombine - Do target-specific dag combines on floating point adds. 18548static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, 18549 const X86Subtarget *Subtarget) { 18550 EVT VT = N->getValueType(0); 18551 SDValue LHS = N->getOperand(0); 18552 SDValue RHS = N->getOperand(1); 18553 18554 // Try to synthesize horizontal adds from adds of shuffles. 18555 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 18556 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 18557 isHorizontalBinOp(LHS, RHS, true)) 18558 return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS); 18559 return SDValue(); 18560} 18561 18562/// PerformFSUBCombine - Do target-specific dag combines on floating point subs. 18563static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, 18564 const X86Subtarget *Subtarget) { 18565 EVT VT = N->getValueType(0); 18566 SDValue LHS = N->getOperand(0); 18567 SDValue RHS = N->getOperand(1); 18568 18569 // Try to synthesize horizontal subs from subs of shuffles. 18570 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 18571 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 18572 isHorizontalBinOp(LHS, RHS, false)) 18573 return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); 18574 return SDValue(); 18575} 18576 18577/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 18578/// X86ISD::FXOR nodes. 18579static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 18580 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 18581 // F[X]OR(0.0, x) -> x 18582 // F[X]OR(x, 0.0) -> x 18583 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 18584 if (C->getValueAPF().isPosZero()) 18585 return N->getOperand(1); 18586 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 18587 if (C->getValueAPF().isPosZero()) 18588 return N->getOperand(0); 18589 return SDValue(); 18590} 18591 18592/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and 18593/// X86ISD::FMAX nodes. 18594static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { 18595 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); 18596 18597 // Only perform optimizations if UnsafeMath is used. 18598 if (!DAG.getTarget().Options.UnsafeFPMath) 18599 return SDValue(); 18600 18601 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes 18602 // into FMINC and FMAXC, which are Commutative operations. 18603 unsigned NewOp = 0; 18604 switch (N->getOpcode()) { 18605 default: llvm_unreachable("unknown opcode"); 18606 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; 18607 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; 18608 } 18609 18610 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), 18611 N->getOperand(0), N->getOperand(1)); 18612} 18613 18614/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
  // FAND(0.0, x) -> 0.0
  // FAND(x, 0.0) -> 0.0
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  return SDValue();
}

/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN
/// nodes.
static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
  // FANDN(x, 0.0) -> 0.0
  // FANDN(0.0, x) -> x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  return SDValue();
}

static SDValue PerformBTCombine(SDNode *N,
                                SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op = N->getOperand(0);
  if (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
      VT.getVectorElementType().getSizeInBits() ==
      OpVT.getVectorElementType().getSizeInBits()) {
    return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
  }
  return SDValue();
}

static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
                                               const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
  // AVX2 since there is no sign-extended shift right operation on vectors
  // with 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2: it may be replaced with an
    // X86ISD::VSEXT node.
18694 if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) 18695 if (!ISD::isNormalLoad(N00.getNode())) 18696 return SDValue(); 18697 18698 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { 18699 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, 18700 N00, N1); 18701 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); 18702 } 18703 } 18704 return SDValue(); 18705} 18706 18707static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, 18708 TargetLowering::DAGCombinerInfo &DCI, 18709 const X86Subtarget *Subtarget) { 18710 if (!DCI.isBeforeLegalizeOps()) 18711 return SDValue(); 18712 18713 if (!Subtarget->hasFp256()) 18714 return SDValue(); 18715 18716 EVT VT = N->getValueType(0); 18717 if (VT.isVector() && VT.getSizeInBits() == 256) { 18718 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); 18719 if (R.getNode()) 18720 return R; 18721 } 18722 18723 return SDValue(); 18724} 18725 18726static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, 18727 const X86Subtarget* Subtarget) { 18728 SDLoc dl(N); 18729 EVT VT = N->getValueType(0); 18730 18731 // Let legalize expand this if it isn't a legal type yet. 18732 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 18733 return SDValue(); 18734 18735 EVT ScalarVT = VT.getScalarType(); 18736 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || 18737 (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) 18738 return SDValue(); 18739 18740 SDValue A = N->getOperand(0); 18741 SDValue B = N->getOperand(1); 18742 SDValue C = N->getOperand(2); 18743 18744 bool NegA = (A.getOpcode() == ISD::FNEG); 18745 bool NegB = (B.getOpcode() == ISD::FNEG); 18746 bool NegC = (C.getOpcode() == ISD::FNEG); 18747 18748 // Negative multiplication when NegA xor NegB 18749 bool NegMul = (NegA != NegB); 18750 if (NegA) 18751 A = A.getOperand(0); 18752 if (NegB) 18753 B = B.getOperand(0); 18754 if (NegC) 18755 C = C.getOperand(0); 18756 18757 unsigned Opcode; 18758 if (!NegMul) 18759 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; 18760 else 18761 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; 18762 18763 return DAG.getNode(Opcode, dl, VT, A, B, C); 18764} 18765 18766static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, 18767 TargetLowering::DAGCombinerInfo &DCI, 18768 const X86Subtarget *Subtarget) { 18769 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 18770 // (and (i32 x86isd::setcc_carry), 1) 18771 // This eliminates the zext. This transformation is necessary because 18772 // ISD::SETCC is always legalized to i8. 
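  // Sketch of the rewrite (types illustrative):
  //   t1: i8  = X86ISD::SETCC_CARRY cc, eflags
  //   t2: i8  = and t1, 1
  //   t3: i32 = zext t2
  // becomes
  //   t1: i32 = X86ISD::SETCC_CARRY cc, eflags
  //   t2: i32 = and t1, 1
  // i.e. the carry is materialized directly at the wider type.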
18773 SDLoc dl(N); 18774 SDValue N0 = N->getOperand(0); 18775 EVT VT = N->getValueType(0); 18776 18777 if (N0.getOpcode() == ISD::AND && 18778 N0.hasOneUse() && 18779 N0.getOperand(0).hasOneUse()) { 18780 SDValue N00 = N0.getOperand(0); 18781 if (N00.getOpcode() == X86ISD::SETCC_CARRY) { 18782 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 18783 if (!C || C->getZExtValue() != 1) 18784 return SDValue(); 18785 return DAG.getNode(ISD::AND, dl, VT, 18786 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 18787 N00.getOperand(0), N00.getOperand(1)), 18788 DAG.getConstant(1, VT)); 18789 } 18790 } 18791 18792 if (VT.is256BitVector()) { 18793 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); 18794 if (R.getNode()) 18795 return R; 18796 } 18797 18798 return SDValue(); 18799} 18800 18801// Optimize x == -y --> x+y == 0 18802// x != -y --> x+y != 0 18803static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { 18804 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 18805 SDValue LHS = N->getOperand(0); 18806 SDValue RHS = N->getOperand(1); 18807 18808 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) 18809 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) 18810 if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { 18811 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), 18812 LHS.getValueType(), RHS, LHS.getOperand(1)); 18813 return DAG.getSetCC(SDLoc(N), N->getValueType(0), 18814 addV, DAG.getConstant(0, addV.getValueType()), CC); 18815 } 18816 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) 18817 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) 18818 if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { 18819 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), 18820 RHS.getValueType(), LHS, RHS.getOperand(1)); 18821 return DAG.getSetCC(SDLoc(N), N->getValueType(0), 18822 addV, DAG.getConstant(0, addV.getValueType()), CC); 18823 } 18824 return SDValue(); 18825} 18826 18827// Helper function of PerformSETCCCombine. It is to materialize "setb reg" 18828// as "sbb reg,reg", since it can be extended without zext and produces 18829// an all-ones bit which is more useful than 0/1 in some cases. 18830static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { 18831 return DAG.getNode(ISD::AND, DL, MVT::i8, 18832 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 18833 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS), 18834 DAG.getConstant(1, MVT::i8)); 18835} 18836 18837// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 18838static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, 18839 TargetLowering::DAGCombinerInfo &DCI, 18840 const X86Subtarget *Subtarget) { 18841 SDLoc DL(N); 18842 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); 18843 SDValue EFLAGS = N->getOperand(1); 18844 18845 if (CC == X86::COND_A) { 18846 // Try to convert COND_A into COND_B in an attempt to facilitate 18847 // materializing "setb reg". 18848 // 18849 // Do not flip "e > c", where "c" is a constant, because Cmp instruction 18850 // cannot take an immediate as its first operand. 
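  // Illustrative rewrite: "x >u y" computed as (sub x, y) + COND_A becomes
  // (sub y, x) + COND_B, and COND_B is then materialized below as
  // "sbb reg,reg; and reg,1" rather than a plain setcc.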
18851 // 18852 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && 18853 EFLAGS.getValueType().isInteger() && 18854 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { 18855 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), 18856 EFLAGS.getNode()->getVTList(), 18857 EFLAGS.getOperand(1), EFLAGS.getOperand(0)); 18858 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); 18859 return MaterializeSETB(DL, NewEFLAGS, DAG); 18860 } 18861 } 18862 18863 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 18864 // a zext and produces an all-ones bit which is more useful than 0/1 in some 18865 // cases. 18866 if (CC == X86::COND_B) 18867 return MaterializeSETB(DL, EFLAGS, DAG); 18868 18869 SDValue Flags; 18870 18871 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 18872 if (Flags.getNode()) { 18873 SDValue Cond = DAG.getConstant(CC, MVT::i8); 18874 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); 18875 } 18876 18877 return SDValue(); 18878} 18879 18880// Optimize branch condition evaluation. 18881// 18882static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, 18883 TargetLowering::DAGCombinerInfo &DCI, 18884 const X86Subtarget *Subtarget) { 18885 SDLoc DL(N); 18886 SDValue Chain = N->getOperand(0); 18887 SDValue Dest = N->getOperand(1); 18888 SDValue EFLAGS = N->getOperand(3); 18889 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); 18890 18891 SDValue Flags; 18892 18893 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 18894 if (Flags.getNode()) { 18895 SDValue Cond = DAG.getConstant(CC, MVT::i8); 18896 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, 18897 Flags); 18898 } 18899 18900 return SDValue(); 18901} 18902 18903static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 18904 const X86TargetLowering *XTLI) { 18905 SDValue Op0 = N->getOperand(0); 18906 EVT InVT = Op0->getValueType(0); 18907 18908 // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) 18909 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { 18910 SDLoc dl(N); 18911 MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; 18912 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); 18913 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); 18914 } 18915 18916 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 18917 // a 32-bit target where SSE doesn't support i64->FP operations. 18918 if (Op0.getOpcode() == ISD::LOAD) { 18919 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 18920 EVT VT = Ld->getValueType(0); 18921 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 18922 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 18923 !XTLI->getSubtarget()->is64Bit() && 18924 VT == MVT::i64) { 18925 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 18926 Ld->getChain(), Op0, DAG); 18927 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 18928 return FILDChain; 18929 } 18930 } 18931 return SDValue(); 18932} 18933 18934// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 18935static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 18936 X86TargetLowering::DAGCombinerInfo &DCI) { 18937 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 18938 // the result is either zero or one (depending on the input carry bit). 18939 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 
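  // Sketch: (adc 0, 0, carry) is 1 when the carry is set and 0 otherwise,
  // which is exactly (and (setcc_carry COND_B, eflags), 1): SETCC_CARRY
  // yields all-ones or zero, and the AND keeps the low bit.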
18940 if (X86::isZeroNode(N->getOperand(0)) && 18941 X86::isZeroNode(N->getOperand(1)) && 18942 // We don't have a good way to replace an EFLAGS use, so only do this when 18943 // dead right now. 18944 SDValue(N, 1).use_empty()) { 18945 SDLoc DL(N); 18946 EVT VT = N->getValueType(0); 18947 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 18948 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 18949 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 18950 DAG.getConstant(X86::COND_B,MVT::i8), 18951 N->getOperand(2)), 18952 DAG.getConstant(1, VT)); 18953 return DCI.CombineTo(N, Res1, CarryOut); 18954 } 18955 18956 return SDValue(); 18957} 18958 18959// fold (add Y, (sete X, 0)) -> adc 0, Y 18960// (add Y, (setne X, 0)) -> sbb -1, Y 18961// (sub (sete X, 0), Y) -> sbb 0, Y 18962// (sub (setne X, 0), Y) -> adc -1, Y 18963static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 18964 SDLoc DL(N); 18965 18966 // Look through ZExts. 18967 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 18968 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 18969 return SDValue(); 18970 18971 SDValue SetCC = Ext.getOperand(0); 18972 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 18973 return SDValue(); 18974 18975 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 18976 if (CC != X86::COND_E && CC != X86::COND_NE) 18977 return SDValue(); 18978 18979 SDValue Cmp = SetCC.getOperand(1); 18980 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 18981 !X86::isZeroNode(Cmp.getOperand(1)) || 18982 !Cmp.getOperand(0).getValueType().isInteger()) 18983 return SDValue(); 18984 18985 SDValue CmpOp0 = Cmp.getOperand(0); 18986 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 18987 DAG.getConstant(1, CmpOp0.getValueType())); 18988 18989 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 18990 if (CC == X86::COND_NE) 18991 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 18992 DL, OtherVal.getValueType(), OtherVal, 18993 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 18994 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, 18995 DL, OtherVal.getValueType(), OtherVal, 18996 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 18997} 18998 18999/// PerformADDCombine - Do target-specific dag combines on integer adds. 19000static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, 19001 const X86Subtarget *Subtarget) { 19002 EVT VT = N->getValueType(0); 19003 SDValue Op0 = N->getOperand(0); 19004 SDValue Op1 = N->getOperand(1); 19005 19006 // Try to synthesize horizontal adds from adds of shuffles. 19007 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || 19008 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && 19009 isHorizontalBinOp(Op0, Op1, true)) 19010 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); 19011 19012 return OptimizeConditionalInDecrement(N, DAG); 19013} 19014 19015static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, 19016 const X86Subtarget *Subtarget) { 19017 SDValue Op0 = N->getOperand(0); 19018 SDValue Op1 = N->getOperand(1); 19019 19020 // X86 can't encode an immediate LHS of a sub. See if we can push the 19021 // negation into a preceding instruction. 19022 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { 19023 // If the RHS of the sub is a XOR with one use and a constant, invert the 19024 // immediate. 
Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue()+1, VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  EVT VT = N->getValueType(0);
  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  return OptimizeConditionalInDecrement(N, DAG);
}

/// performVZEXTCombine - Do target-specific dag combines on X86ISD::VZEXT
/// nodes.
static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  // (vzext (bitcast (vzext x))) -> (vzext x)
  SDValue In = N->getOperand(0);
  while (In.getOpcode() == ISD::BITCAST)
    In = In.getOperand(0);

  if (In.getOpcode() != X86ISD::VZEXT)
    return SDValue();

  return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
                     In.getOperand(0));
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
  case ISD::VSELECT:
  case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
  case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
  case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
  case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
  case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
  case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
  case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
  case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
  case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
  case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
  case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
  case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR: return PerformFORCombine(N, DAG);
  case X86ISD::FMIN:
  case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
  case X86ISD::FAND: return PerformFANDCombine(N, DAG);
  case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
  case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG:
    return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
  case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI, Subtarget);
  case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
  case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
  case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
  case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP: // Handle all target specific shuffles
  case X86ISD::PALIGNR:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPERMILP:
  case X86ISD::VPERM2X128:
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
  case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
  }

  return SDValue();
}

/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// IsDesirableToPromoteOp - This method queries the target whether it is
/// beneficial for the dag combiner to promote the specified node. If true, it
/// should return the desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    // If the non-extending load has a single use and it's not live out, then
    // it might be folded.
    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
                                                     Op.hasOneUse()*/) {
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        // The only case where we'd want to promote LOAD (rather than having
        // it promoted as an operand) is when its only use is a live-out copy.
        if (UI->getOpcode() != ISD::CopyToReg)
          return false;
      }
    }
    Promote = true;
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
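    // Promoting such a shift to i32 would split the i16 load from the i16
    // store, so the single read-modify-write form "shlw %cl, (mem)" could
    // no longer be selected (instruction choice illustrative).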
19193 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 19194 return false; 19195 Promote = true; 19196 break; 19197 } 19198 case ISD::ADD: 19199 case ISD::MUL: 19200 case ISD::AND: 19201 case ISD::OR: 19202 case ISD::XOR: 19203 Commute = true; 19204 // fallthrough 19205 case ISD::SUB: { 19206 SDValue N0 = Op.getOperand(0); 19207 SDValue N1 = Op.getOperand(1); 19208 if (!Commute && MayFoldLoad(N1)) 19209 return false; 19210 // Avoid disabling potential load folding opportunities. 19211 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 19212 return false; 19213 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 19214 return false; 19215 Promote = true; 19216 } 19217 } 19218 19219 PVT = MVT::i32; 19220 return Promote; 19221} 19222 19223//===----------------------------------------------------------------------===// 19224// X86 Inline Assembly Support 19225//===----------------------------------------------------------------------===// 19226 19227namespace { 19228 // Helper to match a string separated by whitespace. 19229 bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { 19230 s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. 19231 19232 for (unsigned i = 0, e = args.size(); i != e; ++i) { 19233 StringRef piece(*args[i]); 19234 if (!s.startswith(piece)) // Check if the piece matches. 19235 return false; 19236 19237 s = s.substr(piece.size()); 19238 StringRef::size_type pos = s.find_first_not_of(" \t"); 19239 if (pos == 0) // We matched a prefix. 19240 return false; 19241 19242 s = s.substr(pos); 19243 } 19244 19245 return s.empty(); 19246 } 19247 const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; 19248} 19249 19250static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { 19251 19252 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { 19253 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && 19254 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && 19255 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { 19256 19257 if (AsmPieces.size() == 3) 19258 return true; 19259 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) 19260 return true; 19261 } 19262 } 19263 return false; 19264} 19265 19266bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 19267 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 19268 19269 std::string AsmStr = IA->getAsmString(); 19270 19271 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 19272 if (!Ty || Ty->getBitWidth() % 16 != 0) 19273 return false; 19274 19275 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 19276 SmallVector<StringRef, 4> AsmPieces; 19277 SplitString(AsmStr, AsmPieces, ";\n"); 19278 19279 switch (AsmPieces.size()) { 19280 default: return false; 19281 case 1: 19282 // FIXME: this should verify that we are targeting a 486 or better. If not, 19283 // we will turn this bswap into something that will be lowered to logical 19284 // ops instead of emitting the bswap asm. For now, we don't support 486 or 19285 // lower so don't worry about this. 
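    // Illustrative IR for what this case matches:
    //   %x = call i32 asm "bswap $0", "=r,0"(i32 %v)
    // which LowerToByteSwap below rewrites to @llvm.bswap.i32(i32 %v).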
19286 // bswap $0 19287 if (matchAsm(AsmPieces[0], "bswap", "$0") || 19288 matchAsm(AsmPieces[0], "bswapl", "$0") || 19289 matchAsm(AsmPieces[0], "bswapq", "$0") || 19290 matchAsm(AsmPieces[0], "bswap", "${0:q}") || 19291 matchAsm(AsmPieces[0], "bswapl", "${0:q}") || 19292 matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { 19293 // No need to check constraints, nothing other than the equivalent of 19294 // "=r,0" would be valid here. 19295 return IntrinsicLowering::LowerToByteSwap(CI); 19296 } 19297 19298 // rorw $$8, ${0:w} --> llvm.bswap.i16 19299 if (CI->getType()->isIntegerTy(16) && 19300 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 19301 (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || 19302 matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { 19303 AsmPieces.clear(); 19304 const std::string &ConstraintsStr = IA->getConstraintString(); 19305 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 19306 array_pod_sort(AsmPieces.begin(), AsmPieces.end()); 19307 if (clobbersFlagRegisters(AsmPieces)) 19308 return IntrinsicLowering::LowerToByteSwap(CI); 19309 } 19310 break; 19311 case 3: 19312 if (CI->getType()->isIntegerTy(32) && 19313 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 19314 matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && 19315 matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && 19316 matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { 19317 AsmPieces.clear(); 19318 const std::string &ConstraintsStr = IA->getConstraintString(); 19319 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 19320 array_pod_sort(AsmPieces.begin(), AsmPieces.end()); 19321 if (clobbersFlagRegisters(AsmPieces)) 19322 return IntrinsicLowering::LowerToByteSwap(CI); 19323 } 19324 19325 if (CI->getType()->isIntegerTy(64)) { 19326 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 19327 if (Constraints.size() >= 2 && 19328 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 19329 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 19330 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 19331 if (matchAsm(AsmPieces[0], "bswap", "%eax") && 19332 matchAsm(AsmPieces[1], "bswap", "%edx") && 19333 matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) 19334 return IntrinsicLowering::LowerToByteSwap(CI); 19335 } 19336 } 19337 break; 19338 } 19339 return false; 19340} 19341 19342/// getConstraintType - Given a constraint letter, return the type of 19343/// constraint it is for this target. 19344X86TargetLowering::ConstraintType 19345X86TargetLowering::getConstraintType(const std::string &Constraint) const { 19346 if (Constraint.size() == 1) { 19347 switch (Constraint[0]) { 19348 case 'R': 19349 case 'q': 19350 case 'Q': 19351 case 'f': 19352 case 't': 19353 case 'u': 19354 case 'y': 19355 case 'x': 19356 case 'Y': 19357 case 'l': 19358 return C_RegisterClass; 19359 case 'a': 19360 case 'b': 19361 case 'c': 19362 case 'd': 19363 case 'S': 19364 case 'D': 19365 case 'A': 19366 return C_Register; 19367 case 'I': 19368 case 'J': 19369 case 'K': 19370 case 'L': 19371 case 'M': 19372 case 'N': 19373 case 'G': 19374 case 'C': 19375 case 'e': 19376 case 'Z': 19377 return C_Other; 19378 default: 19379 break; 19380 } 19381 } 19382 return TargetLowering::getConstraintType(Constraint); 19383} 19384 19385/// Examine constraint type and operand type and determine a weight value. 19386/// This object must already have been set up with the operand type 19387/// and the current alternative constraint selected. 
19388TargetLowering::ConstraintWeight 19389 X86TargetLowering::getSingleConstraintMatchWeight( 19390 AsmOperandInfo &info, const char *constraint) const { 19391 ConstraintWeight weight = CW_Invalid; 19392 Value *CallOperandVal = info.CallOperandVal; 19393 // If we don't have a value, we can't do a match, 19394 // but allow it at the lowest weight. 19395 if (CallOperandVal == NULL) 19396 return CW_Default; 19397 Type *type = CallOperandVal->getType(); 19398 // Look at the constraint type. 19399 switch (*constraint) { 19400 default: 19401 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 19402 case 'R': 19403 case 'q': 19404 case 'Q': 19405 case 'a': 19406 case 'b': 19407 case 'c': 19408 case 'd': 19409 case 'S': 19410 case 'D': 19411 case 'A': 19412 if (CallOperandVal->getType()->isIntegerTy()) 19413 weight = CW_SpecificReg; 19414 break; 19415 case 'f': 19416 case 't': 19417 case 'u': 19418 if (type->isFloatingPointTy()) 19419 weight = CW_SpecificReg; 19420 break; 19421 case 'y': 19422 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 19423 weight = CW_SpecificReg; 19424 break; 19425 case 'x': 19426 case 'Y': 19427 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || 19428 ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256())) 19429 weight = CW_Register; 19430 break; 19431 case 'I': 19432 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 19433 if (C->getZExtValue() <= 31) 19434 weight = CW_Constant; 19435 } 19436 break; 19437 case 'J': 19438 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19439 if (C->getZExtValue() <= 63) 19440 weight = CW_Constant; 19441 } 19442 break; 19443 case 'K': 19444 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19445 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 19446 weight = CW_Constant; 19447 } 19448 break; 19449 case 'L': 19450 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19451 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 19452 weight = CW_Constant; 19453 } 19454 break; 19455 case 'M': 19456 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19457 if (C->getZExtValue() <= 3) 19458 weight = CW_Constant; 19459 } 19460 break; 19461 case 'N': 19462 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19463 if (C->getZExtValue() <= 0xff) 19464 weight = CW_Constant; 19465 } 19466 break; 19467 case 'G': 19468 case 'C': 19469 if (dyn_cast<ConstantFP>(CallOperandVal)) { 19470 weight = CW_Constant; 19471 } 19472 break; 19473 case 'e': 19474 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19475 if ((C->getSExtValue() >= -0x80000000LL) && 19476 (C->getSExtValue() <= 0x7fffffffLL)) 19477 weight = CW_Constant; 19478 } 19479 break; 19480 case 'Z': 19481 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 19482 if (C->getZExtValue() <= 0xffffffff) 19483 weight = CW_Constant; 19484 } 19485 break; 19486 } 19487 return weight; 19488} 19489 19490/// LowerXConstraint - try to replace an X constraint, which matches anything, 19491/// with another that has more specific requirements based on the type of the 19492/// corresponding operand. 19493const char *X86TargetLowering:: 19494LowerXConstraint(EVT ConstraintVT) const { 19495 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 19496 // 'f' like normal targets. 
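// E.g. (illustrative) asm("..." : "=X"(f)) with a float operand resolves to
// constraint "Y" when SSE2 is available, to "x" with only SSE1, and
// otherwise falls through to the generic lowering below.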
19497 if (ConstraintVT.isFloatingPoint()) { 19498 if (Subtarget->hasSSE2()) 19499 return "Y"; 19500 if (Subtarget->hasSSE1()) 19501 return "x"; 19502 } 19503 19504 return TargetLowering::LowerXConstraint(ConstraintVT); 19505} 19506 19507/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 19508/// vector. If it is invalid, don't add anything to Ops. 19509void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 19510 std::string &Constraint, 19511 std::vector<SDValue>&Ops, 19512 SelectionDAG &DAG) const { 19513 SDValue Result(0, 0); 19514 19515 // Only support length 1 constraints for now. 19516 if (Constraint.length() > 1) return; 19517 19518 char ConstraintLetter = Constraint[0]; 19519 switch (ConstraintLetter) { 19520 default: break; 19521 case 'I': 19522 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19523 if (C->getZExtValue() <= 31) { 19524 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 19525 break; 19526 } 19527 } 19528 return; 19529 case 'J': 19530 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19531 if (C->getZExtValue() <= 63) { 19532 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 19533 break; 19534 } 19535 } 19536 return; 19537 case 'K': 19538 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19539 if (isInt<8>(C->getSExtValue())) { 19540 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 19541 break; 19542 } 19543 } 19544 return; 19545 case 'N': 19546 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19547 if (C->getZExtValue() <= 255) { 19548 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 19549 break; 19550 } 19551 } 19552 return; 19553 case 'e': { 19554 // 32-bit signed value 19555 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19556 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 19557 C->getSExtValue())) { 19558 // Widen to 64 bits here to get it sign extended. 19559 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 19560 break; 19561 } 19562 // FIXME gcc accepts some relocatable values here too, but only in certain 19563 // memory models; it's complicated. 19564 } 19565 return; 19566 } 19567 case 'Z': { 19568 // 32-bit unsigned value 19569 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 19570 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 19571 C->getZExtValue())) { 19572 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 19573 break; 19574 } 19575 } 19576 // FIXME gcc accepts some relocatable values here too, but only in certain 19577 // memory models; it's complicated. 19578 return; 19579 } 19580 case 'i': { 19581 // Literal immediates are always ok. 19582 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 19583 // Widen to 64 bits here to get it sign extended. 19584 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 19585 break; 19586 } 19587 19588 // In any sort of PIC mode addresses need to be computed at runtime by 19589 // adding in a register or some sort of table lookup. These can't 19590 // be used as immediates. 19591 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 19592 return; 19593 19594 // If we are in non-pic codegen mode, we allow the address of a global (with 19595 // an optional displacement) to be used with 'i'. 19596 GlobalAddressSDNode *GA = 0; 19597 int64_t Offset = 0; 19598 19599 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
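    // e.g. the loop below folds (add (add GA, 4), 8) to GA with
    // Offset = 12, and (sub GA, 4) to GA with Offset = -4 (constants
    // illustrative).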
19600 while (1) { 19601 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 19602 Offset += GA->getOffset(); 19603 break; 19604 } else if (Op.getOpcode() == ISD::ADD) { 19605 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 19606 Offset += C->getZExtValue(); 19607 Op = Op.getOperand(0); 19608 continue; 19609 } 19610 } else if (Op.getOpcode() == ISD::SUB) { 19611 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 19612 Offset += -C->getZExtValue(); 19613 Op = Op.getOperand(0); 19614 continue; 19615 } 19616 } 19617 19618 // Otherwise, this isn't something we can handle, reject it. 19619 return; 19620 } 19621 19622 const GlobalValue *GV = GA->getGlobal(); 19623 // If we require an extra load to get this address, as in PIC mode, we 19624 // can't accept it. 19625 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 19626 getTargetMachine()))) 19627 return; 19628 19629 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), 19630 GA->getValueType(0), Offset); 19631 break; 19632 } 19633 } 19634 19635 if (Result.getNode()) { 19636 Ops.push_back(Result); 19637 return; 19638 } 19639 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 19640} 19641 19642std::pair<unsigned, const TargetRegisterClass*> 19643X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 19644 MVT VT) const { 19645 // First, see if this is a constraint that directly corresponds to an LLVM 19646 // register class. 19647 if (Constraint.size() == 1) { 19648 // GCC Constraint Letters 19649 switch (Constraint[0]) { 19650 default: break; 19651 // TODO: Slight differences here in allocation order and leaving 19652 // RIP in the class. Do they matter any more here than they do 19653 // in the normal allocation? 19654 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 19655 if (Subtarget->is64Bit()) { 19656 if (VT == MVT::i32 || VT == MVT::f32) 19657 return std::make_pair(0U, &X86::GR32RegClass); 19658 if (VT == MVT::i16) 19659 return std::make_pair(0U, &X86::GR16RegClass); 19660 if (VT == MVT::i8 || VT == MVT::i1) 19661 return std::make_pair(0U, &X86::GR8RegClass); 19662 if (VT == MVT::i64 || VT == MVT::f64) 19663 return std::make_pair(0U, &X86::GR64RegClass); 19664 break; 19665 } 19666 // 32-bit fallthrough 19667 case 'Q': // Q_REGS 19668 if (VT == MVT::i32 || VT == MVT::f32) 19669 return std::make_pair(0U, &X86::GR32_ABCDRegClass); 19670 if (VT == MVT::i16) 19671 return std::make_pair(0U, &X86::GR16_ABCDRegClass); 19672 if (VT == MVT::i8 || VT == MVT::i1) 19673 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); 19674 if (VT == MVT::i64) 19675 return std::make_pair(0U, &X86::GR64_ABCDRegClass); 19676 break; 19677 case 'r': // GENERAL_REGS 19678 case 'l': // INDEX_REGS 19679 if (VT == MVT::i8 || VT == MVT::i1) 19680 return std::make_pair(0U, &X86::GR8RegClass); 19681 if (VT == MVT::i16) 19682 return std::make_pair(0U, &X86::GR16RegClass); 19683 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) 19684 return std::make_pair(0U, &X86::GR32RegClass); 19685 return std::make_pair(0U, &X86::GR64RegClass); 19686 case 'R': // LEGACY_REGS 19687 if (VT == MVT::i8 || VT == MVT::i1) 19688 return std::make_pair(0U, &X86::GR8_NOREXRegClass); 19689 if (VT == MVT::i16) 19690 return std::make_pair(0U, &X86::GR16_NOREXRegClass); 19691 if (VT == MVT::i32 || !Subtarget->is64Bit()) 19692 return std::make_pair(0U, &X86::GR32_NOREXRegClass); 19693 return std::make_pair(0U, &X86::GR64_NOREXRegClass); 19694 case 'f': // FP Stack registers. 
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, &X86::FR64RegClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        return std::make_pair(0U, &X86::VR256RegClass);
      // AVX-512 types.
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) .. st(7) to ST0 .. ST7.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::ST0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }
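  // For example, an x87 clobber written as "~{st(3)}" in LLVM IR should reach
  // this function as the constraint "{st(3)}", which the check above resolves
  // to (X86::ST3, &X86::RFP80RegClass).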
  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == &X86::GR16RegClass) {
    if (VT == MVT::i8 || VT == MVT::i1) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR8RegClass;
      }
    } else if (VT == MVT::i32 || VT == MVT::f32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR32RegClass;
      }
    } else if (VT == MVT::i64 || VT == MVT::f64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR64RegClass;
      }
    }
  } else if (Res.second == &X86::FR32RegClass ||
             Res.second == &X86::FR64RegClass ||
             Res.second == &X86::VR128RegClass ||
             Res.second == &X86::VR256RegClass ||
             Res.second == &X86::FR32XRegClass ||
             Res.second == &X86::FR64XRegClass ||
             Res.second == &X86::VR128XRegClass ||
             Res.second == &X86::VR256XRegClass ||
             Res.second == &X86::VR512RegClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it
    // can find, ignoring the required type.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
  }

  return Res;
}
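// A minimal sketch of the remapping above (hypothetical caller, not code in
// this file): pairing the "{ax}" constraint with a 64-bit value should come
// back widened to RAX rather than AX:
//
//   std::pair<unsigned, const TargetRegisterClass*> R =
//       TLI.getRegForInlineAsmConstraint("{ax}", MVT::i64);
//   // R.first == X86::RAX, R.second == &X86::GR64RegClass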