X86ISelLowering.cpp revision d6fb53adb19ccfbfb1eedec11c899aaa8401d036
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CallingConv.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <bitset>
#include <cctype>
using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference. Idx is an index in the 128 bits we
/// want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, DebugLoc dl) {
  EVT VT = Vec.getValueType();
  assert(VT.is256BitVector() && "Unexpected vector size!");
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/128;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getUNDEF(ResultVT);

  // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
  // we can match to VEXTRACTF128.
  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

  // This is the index of the first element of the 128-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                               * ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                               VecIdx);

  return Result;
}

/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128 instruction or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
                                  unsigned IdxVal, SelectionDAG &DAG,
                                  DebugLoc dl) {
  // Inserting UNDEF is Result
  if (Vec.getOpcode() == ISD::UNDEF)
    return Result;

  EVT VT = Vec.getValueType();
  assert(VT.is128BitVector() && "Unexpected vector size!");

  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant 128 bits.
  unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();

  // This is the index of the first element of the 128-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                               * ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                     VecIdx);
}

/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   DebugLoc dl) {
  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X86_64MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetLinux())
    return new X86LinuxTargetObjectFile();
  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();

  RegInfo = TM.getRegisterInfo();
  TD = getDataLayout();

  // Set up the TargetLowering object.
  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit since we have so many registers use the ILP scheduler, for
  // 32-bit code use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget->isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass i32 with i8 on Atom when compiling with O2
  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
    addBypassSlowDiv(32, 8);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

    // The _ftol2 runtime function has an unusual calling conv, which
    // is modeled by a special pseudo-instruction.
    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
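  // (Promoting the operand zero-extends it, and a zero-extended i1/i8/i16
  // value is always non-negative, so the wider signed conversion yields the
  // same result.)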
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  if (isTargetFTOL()) {
    // Use the _ftol2 runtime function, which has a pseudo-instruction
    // to handle its weird calling convention.
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
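  // (There is no 8-bit form of BSF/TZCNT, and the 32-bit encoding avoids the
  // operand-size prefix the 16-bit form would need.)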
  setOperationAction(ISD::CTTZ, MVT::i8, Promote);
  AddPromotedToType (ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Promote);
  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
  }

  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ, MVT::i8, Promote);
    AddPromotedToType (ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Promote);
    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuations, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented, and please don't build your
  // own exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
  setShouldFoldAtomicFences(true);

  // Expand certain atomics
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else if (TM.Options.EnableSegmentedStacks)
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Expand);

  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
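    // (The RFP register classes model values that live on the x87
    // floating-point stack.)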
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f32, Expand);
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f32, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
       i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT VT = (MVT::SimpleValueType)i;
    setOperationAction(ISD::ADD, VT, Expand);
    setOperationAction(ISD::SUB, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::MUL, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction(VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions from being expanded to MMX
  // ones with -msoft-float, disable use of MMX as well.
  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND, MVT::v8i8, Expand);
  setOperationAction(ISD::AND, MVT::v4i16, Expand);
  setOperationAction(ISD::AND, MVT::v2i32, Expand);
  setOperationAction(ISD::AND, MVT::v1i64, Expand);
  setOperationAction(ISD::OR, MVT::v8i8, Expand);
  setOperationAction(ISD::OR, MVT::v4i16, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::OR, MVT::v1i64, Expand);
  setOperationAction(ISD::XOR, MVT::v8i8, Expand);
  setOperationAction(ISD::XOR, MVT::v4i16, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  setOperationAction(ISD::XOR, MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
  setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
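    // All 128-bit integer vector types live in the same XMM (VR128) register
    // file as the FP vector types.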
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
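    // (Bitwise ops and whole-vector loads are element-type agnostic, so
    // routing them through the v2i64 patterns avoids duplicating selection
    // patterns for every integer element type.)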
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v2i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v2i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget->is64Bit())
      setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);

    // i8 and i16 vectors are custom because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    // FIXME: these should be Legal, but that's only for the case where
    // the index is constant. For now custom expand to deal with that.
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
  }

  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL, MVT::v8i16, Custom);
    setOperationAction(ISD::SRL, MVT::v16i8, Custom);

    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
    setOperationAction(ISD::SRA, MVT::v16i8, Custom);

    if (Subtarget->hasInt256()) {
      setOperationAction(ISD::SRL, MVT::v2i64, Legal);
      setOperationAction(ISD::SRL, MVT::v4i32, Legal);

      setOperationAction(ISD::SHL, MVT::v2i64, Legal);
      setOperationAction(ISD::SHL, MVT::v4i32, Legal);

      setOperationAction(ISD::SRA, MVT::v4i32, Legal);
    } else {
      setOperationAction(ISD::SRL, MVT::v2i64, Custom);
      setOperationAction(ISD::SRL, MVT::v4i32, Custom);

      setOperationAction(ISD::SHL, MVT::v2i64, Custom);
      setOperationAction(ISD::SHL, MVT::v4i32, Custom);

      setOperationAction(ISD::SRA, MVT::v4i32, Custom);
    }
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
    addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, &X86::VR256RegClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);

    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::FABS, MVT::v8f32, Custom);

    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
    setOperationAction(ISD::FABS, MVT::v4f64, Custom);

    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);

    setOperationAction(ISD::SRL, MVT::v16i16, Custom);
    setOperationAction(ISD::SRL, MVT::v32i8, Custom);

    setOperationAction(ISD::SHL, MVT::v16i16, Custom);
    setOperationAction(ISD::SHL, MVT::v32i8, Custom);

    setOperationAction(ISD::SRA, MVT::v16i16, Custom);
    setOperationAction(ISD::SRA, MVT::v32i8, Custom);

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
    setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);

    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
      setOperationAction(ISD::FMA, MVT::v8f32, Legal);
      setOperationAction(ISD::FMA, MVT::v4f64, Legal);
      setOperationAction(ISD::FMA, MVT::v4f32, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::f32, Legal);
      setOperationAction(ISD::FMA, MVT::f64, Legal);
    }

    if (Subtarget->hasInt256()) {
      setOperationAction(ISD::ADD, MVT::v4i64, Legal);
      setOperationAction(ISD::ADD, MVT::v8i32, Legal);
      setOperationAction(ISD::ADD, MVT::v16i16, Legal);
      setOperationAction(ISD::ADD, MVT::v32i8, Legal);

      setOperationAction(ISD::SUB, MVT::v4i64, Legal);
      setOperationAction(ISD::SUB, MVT::v8i32, Legal);
      setOperationAction(ISD::SUB, MVT::v16i16, Legal);
      setOperationAction(ISD::SUB, MVT::v32i8, Legal);

      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
      setOperationAction(ISD::MUL, MVT::v8i32, Legal);
      setOperationAction(ISD::MUL, MVT::v16i16, Legal);
      // Don't lower v32i8 because there is no 128-bit byte mul

      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

      setOperationAction(ISD::SRL, MVT::v4i64, Legal);
      setOperationAction(ISD::SRL, MVT::v8i32, Legal);

      setOperationAction(ISD::SHL, MVT::v4i64, Legal);
      setOperationAction(ISD::SHL, MVT::v8i32, Legal);

      setOperationAction(ISD::SRA, MVT::v8i32, Legal);
    } else {
      setOperationAction(ISD::ADD, MVT::v4i64, Custom);
      setOperationAction(ISD::ADD, MVT::v8i32, Custom);
      setOperationAction(ISD::ADD, MVT::v16i16, Custom);
      setOperationAction(ISD::ADD, MVT::v32i8, Custom);

      setOperationAction(ISD::SUB, MVT::v4i64, Custom);
      setOperationAction(ISD::SUB, MVT::v8i32, Custom);
      setOperationAction(ISD::SUB, MVT::v16i16, Custom);
      setOperationAction(ISD::SUB, MVT::v32i8, Custom);

      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
      setOperationAction(ISD::MUL, MVT::v8i32, Custom);
      setOperationAction(ISD::MUL, MVT::v16i16, Custom);
      // Don't lower v32i8 because there is no 128-bit byte mul

      setOperationAction(ISD::SRL, MVT::v4i64, Custom);
      setOperationAction(ISD::SRL, MVT::v8i32, Custom);

      setOperationAction(ISD::SHL, MVT::v4i64, Custom);
      setOperationAction(ISD::SHL, MVT::v8i32, Custom);

      setOperationAction(ISD::SRA, MVT::v8i32, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
         i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Extract subvector is special because the value type
      // (result) is 128-bit but the source is 256-bit wide.
      if (VT.is128BitVector())
        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

      // Do not attempt to custom lower other non-256-bit vectors
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-256-bit vectors
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }
  }

  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
  // of this type with custom code.
  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
       VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
                       Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
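  // IntVTs is { i8, i16, i32, i64 }; the bound of 3 + is64Bit() skips i64 on
  // 32-bit targets.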
  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
    // Add/Sub/Mul with overflow operations are custom lowered.
    MVT VT = IntVTs[i];
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);
  }

  // There are no 8-bit 3-address imul/mul instructions
  setOperationAction(ISD::SMULO, MVT::i8, Expand);
  setOperationAction(ISD::UMULO, MVT::i8, Expand);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::VSELECT);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::SETCC);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::XOR);

  computeRegisterProperties();

  // On Darwin, -Os means optimize for size without hurting performance, so
  // do not reduce the limit.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(4); // 2^4 bytes.
  benefitFromCodePlacementOpt = true;

  // Predictable cmovs don't hurt on Atom because it's in-order.
  predictableSelectIsExpensive = !Subtarget->isAtom();

  setPrefFunctionAlignment(4); // 2^4 bytes.
}

EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
  if (!VT.isVector()) return MVT::i8;
  return VT.changeVectorElementTypeToInteger();
}

/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
1324static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { 1325 if (MaxAlign == 16) 1326 return; 1327 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1328 if (VTy->getBitWidth() == 128) 1329 MaxAlign = 16; 1330 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1331 unsigned EltAlign = 0; 1332 getMaxByValAlign(ATy->getElementType(), EltAlign); 1333 if (EltAlign > MaxAlign) 1334 MaxAlign = EltAlign; 1335 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 1336 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 1337 unsigned EltAlign = 0; 1338 getMaxByValAlign(STy->getElementType(i), EltAlign); 1339 if (EltAlign > MaxAlign) 1340 MaxAlign = EltAlign; 1341 if (MaxAlign == 16) 1342 break; 1343 } 1344 } 1345} 1346 1347/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 1348/// function arguments in the caller parameter area. For X86, aggregates 1349/// that contain SSE vectors are placed at 16-byte boundaries while the rest 1350/// are at 4-byte boundaries. 1351unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { 1352 if (Subtarget->is64Bit()) { 1353 // Max of 8 and alignment of type. 1354 unsigned TyAlign = TD->getABITypeAlignment(Ty); 1355 if (TyAlign > 8) 1356 return TyAlign; 1357 return 8; 1358 } 1359 1360 unsigned Align = 4; 1361 if (Subtarget->hasSSE1()) 1362 getMaxByValAlign(Ty, Align); 1363 return Align; 1364} 1365 1366/// getOptimalMemOpType - Returns the target specific optimal type for load 1367/// and store operations as a result of memset, memcpy, and memmove 1368/// lowering. If DstAlign is zero that means the destination 1369/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it 1370/// means there is no need to check it against the alignment requirement, 1371/// probably because the source does not need to be loaded. If 'IsMemset' is 1372/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that 1373/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy 1374/// source is constant so it does not need to be loaded. 1375/// It returns EVT::Other if the type should be determined using generic 1376/// target-independent logic. 1377EVT 1378X86TargetLowering::getOptimalMemOpType(uint64_t Size, 1379 unsigned DstAlign, unsigned SrcAlign, 1380 bool IsMemset, bool ZeroMemset, 1381 bool MemcpyStrSrc, 1382 MachineFunction &MF) const { 1383 const Function *F = MF.getFunction(); 1384 if ((!IsMemset || ZeroMemset) && 1385 !F->getFnAttributes().hasAttribute(Attribute::NoImplicitFloat)) { 1386 if (Size >= 16 && 1387 (Subtarget->isUnalignedMemAccessFast() || 1388 ((DstAlign == 0 || DstAlign >= 16) && 1389 (SrcAlign == 0 || SrcAlign >= 16)))) { 1390 if (Size >= 32) { 1391 if (Subtarget->hasInt256()) 1392 return MVT::v8i32; 1393 if (Subtarget->hasFp256()) 1394 return MVT::v8f32; 1395 } 1396 if (Subtarget->hasSSE2()) 1397 return MVT::v4i32; 1398 if (Subtarget->hasSSE1()) 1399 return MVT::v4f32; 1400 } else if (!MemcpyStrSrc && Size >= 8 && 1401 !Subtarget->is64Bit() && 1402 Subtarget->hasSSE2()) { 1403 // Do not use f64 to lower memcpy if source is string constant. It's 1404 // better to use i32 to avoid the loads.
1405 return MVT::f64; 1406 } 1407 } 1408 if (Subtarget->is64Bit() && Size >= 8) 1409 return MVT::i64; 1410 return MVT::i32; 1411} 1412 1413bool X86TargetLowering::isSafeMemOpType(MVT VT) const { 1414 if (VT == MVT::f32) 1415 return X86ScalarSSEf32; 1416 else if (VT == MVT::f64) 1417 return X86ScalarSSEf64; 1418 return true; 1419} 1420 1421bool 1422X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const { 1423 if (Fast) 1424 *Fast = Subtarget->isUnalignedMemAccessFast(); 1425 return true; 1426} 1427 1428/// getJumpTableEncoding - Return the entry encoding for a jump table in the 1429/// current function. The returned value is a member of the 1430/// MachineJumpTableInfo::JTEntryKind enum. 1431unsigned X86TargetLowering::getJumpTableEncoding() const { 1432 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 1433 // symbol. 1434 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1435 Subtarget->isPICStyleGOT()) 1436 return MachineJumpTableInfo::EK_Custom32; 1437 1438 // Otherwise, use the normal jump table encoding heuristics. 1439 return TargetLowering::getJumpTableEncoding(); 1440} 1441 1442const MCExpr * 1443X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1444 const MachineBasicBlock *MBB, 1445 unsigned uid,MCContext &Ctx) const{ 1446 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1447 Subtarget->isPICStyleGOT()); 1448 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1449 // entries. 1450 return MCSymbolRefExpr::Create(MBB->getSymbol(), 1451 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1452} 1453 1454/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC 1455/// jumptable. 1456SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1457 SelectionDAG &DAG) const { 1458 if (!Subtarget->is64Bit()) 1459 // This doesn't have DebugLoc associated with it, but is not really the 1460 // same as a Register. 1461 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1462 return Table; 1463} 1464 1465/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1466/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1467/// MCExpr. 1468const MCExpr *X86TargetLowering:: 1469getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1470 MCContext &Ctx) const { 1471 // X86-64 uses RIP relative addressing based on the jump table label. 1472 if (Subtarget->isPICStyleRIPRel()) 1473 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1474 1475 // Otherwise, the reference is relative to the PIC base. 1476 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); 1477} 1478 1479// FIXME: Why is this routine here? Move to RegInfo! 1480std::pair<const TargetRegisterClass*, uint8_t> 1481X86TargetLowering::findRepresentativeClass(MVT VT) const{ 1482 const TargetRegisterClass *RRC = 0; 1483 uint8_t Cost = 1; 1484 switch (VT.SimpleTy) { 1485 default: 1486 return TargetLowering::findRepresentativeClass(VT); 1487 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1488 RRC = Subtarget->is64Bit() ?
1489 (const TargetRegisterClass*)&X86::GR64RegClass : 1490 (const TargetRegisterClass*)&X86::GR32RegClass; 1491 break; 1492 case MVT::x86mmx: 1493 RRC = &X86::VR64RegClass; 1494 break; 1495 case MVT::f32: case MVT::f64: 1496 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1497 case MVT::v4f32: case MVT::v2f64: 1498 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1499 case MVT::v4f64: 1500 RRC = &X86::VR128RegClass; 1501 break; 1502 } 1503 return std::make_pair(RRC, Cost); 1504} 1505 1506bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1507 unsigned &Offset) const { 1508 if (!Subtarget->isTargetLinux()) 1509 return false; 1510 1511 if (Subtarget->is64Bit()) { 1512 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1513 Offset = 0x28; 1514 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1515 AddressSpace = 256; 1516 else 1517 AddressSpace = 257; 1518 } else { 1519 // %gs:0x14 on i386 1520 Offset = 0x14; 1521 AddressSpace = 256; 1522 } 1523 return true; 1524} 1525 1526//===----------------------------------------------------------------------===// 1527// Return Value Calling Convention Implementation 1528//===----------------------------------------------------------------------===// 1529 1530#include "X86GenCallingConv.inc" 1531 1532bool 1533X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1534 MachineFunction &MF, bool isVarArg, 1535 const SmallVectorImpl<ISD::OutputArg> &Outs, 1536 LLVMContext &Context) const { 1537 SmallVector<CCValAssign, 16> RVLocs; 1538 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1539 RVLocs, Context); 1540 return CCInfo.CheckReturn(Outs, RetCC_X86); 1541} 1542 1543SDValue 1544X86TargetLowering::LowerReturn(SDValue Chain, 1545 CallingConv::ID CallConv, bool isVarArg, 1546 const SmallVectorImpl<ISD::OutputArg> &Outs, 1547 const SmallVectorImpl<SDValue> &OutVals, 1548 DebugLoc dl, SelectionDAG &DAG) const { 1549 MachineFunction &MF = DAG.getMachineFunction(); 1550 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1551 1552 SmallVector<CCValAssign, 16> RVLocs; 1553 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1554 RVLocs, *DAG.getContext()); 1555 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1556 1557 // Add the regs to the liveout set for the function. 1558 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1559 for (unsigned i = 0; i != RVLocs.size(); ++i) 1560 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1561 MRI.addLiveOut(RVLocs[i].getLocReg()); 1562 1563 SDValue Flag; 1564 1565 SmallVector<SDValue, 6> RetOps; 1566 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1567 // Operand #1 = Bytes To Pop 1568 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1569 MVT::i16)); 1570 1571 // Copy the result values into the output registers. 
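  // Each value below is first promoted/bitcast to its assigned location type,
  // then either pushed directly as a RET operand (the x87 ST0/ST1 case, left
  // to the FP stackifier) or copied into its physical register with a glued
  // CopyToReg; the accumulated glue becomes the last operand of the
  // X86ISD::RET_FLAG node built at the end.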
1572 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1573 CCValAssign &VA = RVLocs[i]; 1574 assert(VA.isRegLoc() && "Can only return in registers!"); 1575 SDValue ValToCopy = OutVals[i]; 1576 EVT ValVT = ValToCopy.getValueType(); 1577 1578 // Promote values to the appropriate types 1579 if (VA.getLocInfo() == CCValAssign::SExt) 1580 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 1581 else if (VA.getLocInfo() == CCValAssign::ZExt) 1582 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); 1583 else if (VA.getLocInfo() == CCValAssign::AExt) 1584 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); 1585 else if (VA.getLocInfo() == CCValAssign::BCvt) 1586 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); 1587 1588 // If this is x86-64, and we disabled SSE, we can't return FP values, 1589 // or SSE or MMX vectors. 1590 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1591 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1592 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1593 report_fatal_error("SSE register return with SSE disabled"); 1594 } 1595 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1596 // llvm-gcc has never done it right and no one has noticed, so this 1597 // should be OK for now. 1598 if (ValVT == MVT::f64 && 1599 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) 1600 report_fatal_error("SSE2 register return with SSE2 disabled"); 1601 1602 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1603 // the RET instruction and handled by the FP Stackifier. 1604 if (VA.getLocReg() == X86::ST0 || 1605 VA.getLocReg() == X86::ST1) { 1606 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1607 // change the value to the FP stack register class. 1608 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1609 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1610 RetOps.push_back(ValToCopy); 1611 // Don't emit a copytoreg. 1612 continue; 1613 } 1614 1615 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1616 // which is returned in RAX / RDX. 1617 if (Subtarget->is64Bit()) { 1618 if (ValVT == MVT::x86mmx) { 1619 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1620 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1621 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1622 ValToCopy); 1623 // If we don't have SSE2 available, convert to v4f32 so the generated 1624 // register is legal. 1625 if (!Subtarget->hasSSE2()) 1626 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1627 } 1628 } 1629 } 1630 1631 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1632 Flag = Chain.getValue(1); 1633 } 1634 1635 // The x86-64 ABI for returning structs by value requires that we copy 1636 // the sret argument into %rax for the return. We saved the argument into 1637 // a virtual register in the entry block, so now we copy the value out 1638 // and into %rax. 
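  // Illustrative case (assumed signature, not from this file): for a function
  //   define void @f(%struct.S* sret %out, ...)
  // the pointer saved in LowerFormalArguments is reloaded here and copied into
  // RAX, so callers can rely on the ABI-mandated return value.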
1639 if (Subtarget->is64Bit() && 1640 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1641 MachineFunction &MF = DAG.getMachineFunction(); 1642 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1643 unsigned Reg = FuncInfo->getSRetReturnReg(); 1644 assert(Reg && 1645 "SRetReturnReg should have been set in LowerFormalArguments()."); 1646 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1647 1648 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1649 Flag = Chain.getValue(1); 1650 1651 // RAX now acts like a return value. 1652 MRI.addLiveOut(X86::RAX); 1653 } 1654 1655 RetOps[0] = Chain; // Update chain. 1656 1657 // Add the flag if we have it. 1658 if (Flag.getNode()) 1659 RetOps.push_back(Flag); 1660 1661 return DAG.getNode(X86ISD::RET_FLAG, dl, 1662 MVT::Other, &RetOps[0], RetOps.size()); 1663} 1664 1665bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 1666 if (N->getNumValues() != 1) 1667 return false; 1668 if (!N->hasNUsesOfValue(1, 0)) 1669 return false; 1670 1671 SDValue TCChain = Chain; 1672 SDNode *Copy = *N->use_begin(); 1673 if (Copy->getOpcode() == ISD::CopyToReg) { 1674 // If the copy has a glue operand, we conservatively assume it isn't safe to 1675 // perform a tail call. 1676 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 1677 return false; 1678 TCChain = Copy->getOperand(0); 1679 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 1680 return false; 1681 1682 bool HasRet = false; 1683 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1684 UI != UE; ++UI) { 1685 if (UI->getOpcode() != X86ISD::RET_FLAG) 1686 return false; 1687 HasRet = true; 1688 } 1689 1690 if (!HasRet) 1691 return false; 1692 1693 Chain = TCChain; 1694 return true; 1695} 1696 1697MVT 1698X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, 1699 ISD::NodeType ExtendKind) const { 1700 MVT ReturnMVT; 1701 // TODO: Is this also valid on 32-bit? 1702 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1703 ReturnMVT = MVT::i8; 1704 else 1705 ReturnMVT = MVT::i32; 1706 1707 MVT MinVT = getRegisterType(ReturnMVT); 1708 return VT.bitsLT(MinVT) ? MinVT : VT; 1709} 1710 1711/// LowerCallResult - Lower the result values of a call into the 1712/// appropriate copies out of appropriate physical registers. 1713/// 1714SDValue 1715X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1716 CallingConv::ID CallConv, bool isVarArg, 1717 const SmallVectorImpl<ISD::InputArg> &Ins, 1718 DebugLoc dl, SelectionDAG &DAG, 1719 SmallVectorImpl<SDValue> &InVals) const { 1720 1721 // Assign locations to each value returned by this call. 1722 SmallVector<CCValAssign, 16> RVLocs; 1723 bool Is64Bit = Subtarget->is64Bit(); 1724 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1725 getTargetMachine(), RVLocs, *DAG.getContext()); 1726 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1727 1728 // Copy all of the result registers out of their specified physreg. 
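  // Note: results arriving on the x87 stack (ST0/ST1) are handled specially
  // below via FpPOP_RETVAL so the value is always popped even if unused;
  // everything else is a plain CopyFromReg.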
1729 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1730 CCValAssign &VA = RVLocs[i]; 1731 EVT CopyVT = VA.getValVT(); 1732 1733 // If this is x86-64, and we disabled SSE, we can't return FP values 1734 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1735 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1736 report_fatal_error("SSE register return with SSE disabled"); 1737 } 1738 1739 SDValue Val; 1740 1741 // If this is a call to a function that returns an fp value on the floating 1742 // point stack, we must guarantee the value is popped from the stack, so 1743 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1744 // if the return value is not used. We use the FpPOP_RETVAL instruction 1745 // instead. 1746 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1747 // If we prefer to use the value in xmm registers, copy it out as f80 and 1748 // use a truncate to move it from fp stack reg to xmm reg. 1749 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1750 SDValue Ops[] = { Chain, InFlag }; 1751 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, 1752 MVT::Other, MVT::Glue, Ops, 2), 1); 1753 Val = Chain.getValue(0); 1754 1755 // Round the f80 to the right size, which also moves it to the appropriate 1756 // xmm register. 1757 if (CopyVT != VA.getValVT()) 1758 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1759 // This truncation won't change the value. 1760 DAG.getIntPtrConstant(1)); 1761 } else { 1762 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1763 CopyVT, InFlag).getValue(1); 1764 Val = Chain.getValue(0); 1765 } 1766 InFlag = Chain.getValue(2); 1767 InVals.push_back(Val); 1768 } 1769 1770 return Chain; 1771} 1772 1773//===----------------------------------------------------------------------===// 1774// C & StdCall & Fast Calling Convention implementation 1775//===----------------------------------------------------------------------===// 1776// StdCall calling convention seems to be standard for many Windows' API 1777// routines and around. It differs from C calling convention just a little: 1778// callee should clean up the stack, not caller. Symbols should be also 1779// decorated in some fancy way :) It doesn't support any vector arguments. 1780// For info on fast calling convention see Fast Calling Convention (tail call) 1781// implementation LowerX86_32FastCCCallTo. 1782 1783/// CallIsStructReturn - Determines whether a call uses struct return 1784/// semantics. 1785enum StructReturnType { 1786 NotStructReturn, 1787 RegStructReturn, 1788 StackStructReturn 1789}; 1790static StructReturnType 1791callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1792 if (Outs.empty()) 1793 return NotStructReturn; 1794 1795 const ISD::ArgFlagsTy &Flags = Outs[0].Flags; 1796 if (!Flags.isSRet()) 1797 return NotStructReturn; 1798 if (Flags.isInReg()) 1799 return RegStructReturn; 1800 return StackStructReturn; 1801} 1802 1803/// ArgsAreStructReturn - Determines whether a function uses struct 1804/// return semantics. 
1805static StructReturnType 1806argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1807 if (Ins.empty()) 1808 return NotStructReturn; 1809 1810 const ISD::ArgFlagsTy &Flags = Ins[0].Flags; 1811 if (!Flags.isSRet()) 1812 return NotStructReturn; 1813 if (Flags.isInReg()) 1814 return RegStructReturn; 1815 return StackStructReturn; 1816} 1817 1818/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1819/// by "Src" to address "Dst" with size and alignment information specified by 1820/// the specific parameter attribute. The copy will be passed as a byval 1821/// function parameter. 1822static SDValue 1823CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1824 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1825 DebugLoc dl) { 1826 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1827 1828 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1829 /*isVolatile*/false, /*AlwaysInline=*/true, 1830 MachinePointerInfo(), MachinePointerInfo()); 1831} 1832 1833/// IsTailCallConvention - Return true if the calling convention is one that 1834/// supports tail call optimization. 1835static bool IsTailCallConvention(CallingConv::ID CC) { 1836 return (CC == CallingConv::Fast || CC == CallingConv::GHC || 1837 CC == CallingConv::HiPE); 1838} 1839 1840bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 1841 if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) 1842 return false; 1843 1844 CallSite CS(CI); 1845 CallingConv::ID CalleeCC = CS.getCallingConv(); 1846 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 1847 return false; 1848 1849 return true; 1850} 1851 1852/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1853/// a tailcall target by changing its ABI. 1854static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, 1855 bool GuaranteedTailCallOpt) { 1856 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1857} 1858 1859SDValue 1860X86TargetLowering::LowerMemArgument(SDValue Chain, 1861 CallingConv::ID CallConv, 1862 const SmallVectorImpl<ISD::InputArg> &Ins, 1863 DebugLoc dl, SelectionDAG &DAG, 1864 const CCValAssign &VA, 1865 MachineFrameInfo *MFI, 1866 unsigned i) const { 1867 // Create the nodes corresponding to a load from this parameter slot. 1868 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1869 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv, 1870 getTargetMachine().Options.GuaranteedTailCallOpt); 1871 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1872 EVT ValVT; 1873 1874 // If value is passed by pointer we have address passed instead of the value 1875 // itself. 1876 if (VA.getLocInfo() == CCValAssign::Indirect) 1877 ValVT = VA.getLocVT(); 1878 else 1879 ValVT = VA.getValVT(); 1880 1881 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1882 // changed with more analysis. 1883 // In case of tail call optimization mark all arguments mutable. Since they 1884 // could be overwritten by lowering of arguments in case of a tail call. 1885 if (Flags.isByVal()) { 1886 unsigned Bytes = Flags.getByValSize(); 1887 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 
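    // For byval we only materialize the fixed stack object and hand back its
    // address; no load is emitted here because the argument already lives in
    // the caller-created copy on the stack.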
1888 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 1889 return DAG.getFrameIndex(FI, getPointerTy()); 1890 } else { 1891 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1892 VA.getLocMemOffset(), isImmutable); 1893 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1894 return DAG.getLoad(ValVT, dl, Chain, FIN, 1895 MachinePointerInfo::getFixedStack(FI), 1896 false, false, false, 0); 1897 } 1898} 1899 1900SDValue 1901X86TargetLowering::LowerFormalArguments(SDValue Chain, 1902 CallingConv::ID CallConv, 1903 bool isVarArg, 1904 const SmallVectorImpl<ISD::InputArg> &Ins, 1905 DebugLoc dl, 1906 SelectionDAG &DAG, 1907 SmallVectorImpl<SDValue> &InVals) 1908 const { 1909 MachineFunction &MF = DAG.getMachineFunction(); 1910 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1911 1912 const Function* Fn = MF.getFunction(); 1913 if (Fn->hasExternalLinkage() && 1914 Subtarget->isTargetCygMing() && 1915 Fn->getName() == "main") 1916 FuncInfo->setForceFramePointer(true); 1917 1918 MachineFrameInfo *MFI = MF.getFrameInfo(); 1919 bool Is64Bit = Subtarget->is64Bit(); 1920 bool IsWindows = Subtarget->isTargetWindows(); 1921 bool IsWin64 = Subtarget->isTargetWin64(); 1922 1923 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1924 "Var args not supported with calling convention fastcc, ghc or hipe"); 1925 1926 // Assign locations to all of the incoming arguments. 1927 SmallVector<CCValAssign, 16> ArgLocs; 1928 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1929 ArgLocs, *DAG.getContext()); 1930 1931 // Allocate shadow area for Win64 1932 if (IsWin64) { 1933 CCInfo.AllocateStack(32, 8); 1934 } 1935 1936 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1937 1938 unsigned LastVal = ~0U; 1939 SDValue ArgValue; 1940 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1941 CCValAssign &VA = ArgLocs[i]; 1942 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1943 // places. 1944 assert(VA.getValNo() != LastVal && 1945 "Don't support value assigned to multiple locs yet"); 1946 (void)LastVal; 1947 LastVal = VA.getValNo(); 1948 1949 if (VA.isRegLoc()) { 1950 EVT RegVT = VA.getLocVT(); 1951 const TargetRegisterClass *RC; 1952 if (RegVT == MVT::i32) 1953 RC = &X86::GR32RegClass; 1954 else if (Is64Bit && RegVT == MVT::i64) 1955 RC = &X86::GR64RegClass; 1956 else if (RegVT == MVT::f32) 1957 RC = &X86::FR32RegClass; 1958 else if (RegVT == MVT::f64) 1959 RC = &X86::FR64RegClass; 1960 else if (RegVT.is256BitVector()) 1961 RC = &X86::VR256RegClass; 1962 else if (RegVT.is128BitVector()) 1963 RC = &X86::VR128RegClass; 1964 else if (RegVT == MVT::x86mmx) 1965 RC = &X86::VR64RegClass; 1966 else 1967 llvm_unreachable("Unknown argument type!"); 1968 1969 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1970 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1971 1972 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1973 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1974 // right size. 
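      // For example (illustrative), an i8 argument arriving in a 32-bit GPR is
      // modelled as an i32 CopyFromReg wrapped in AssertZext/AssertSext and
      // then truncated back to i8, so later combines know the state of the
      // high bits.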
1975 if (VA.getLocInfo() == CCValAssign::SExt) 1976 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1977 DAG.getValueType(VA.getValVT())); 1978 else if (VA.getLocInfo() == CCValAssign::ZExt) 1979 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1980 DAG.getValueType(VA.getValVT())); 1981 else if (VA.getLocInfo() == CCValAssign::BCvt) 1982 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1983 1984 if (VA.isExtInLoc()) { 1985 // Handle MMX values passed in XMM regs. 1986 if (RegVT.isVector()) { 1987 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1988 ArgValue); 1989 } else 1990 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1991 } 1992 } else { 1993 assert(VA.isMemLoc()); 1994 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1995 } 1996 1997 // If value is passed via pointer - do a load. 1998 if (VA.getLocInfo() == CCValAssign::Indirect) 1999 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 2000 MachinePointerInfo(), false, false, false, 0); 2001 2002 InVals.push_back(ArgValue); 2003 } 2004 2005 // The x86-64 ABI for returning structs by value requires that we copy 2006 // the sret argument into %rax for the return. Save the argument into 2007 // a virtual register so that we can access it from the return points. 2008 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 2009 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2010 unsigned Reg = FuncInfo->getSRetReturnReg(); 2011 if (!Reg) { 2012 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 2013 FuncInfo->setSRetReturnReg(Reg); 2014 } 2015 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 2016 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 2017 } 2018 2019 unsigned StackSize = CCInfo.getNextStackOffset(); 2020 // Align stack specially for tail calls. 2021 if (FuncIsMadeTailCallSafe(CallConv, 2022 MF.getTarget().Options.GuaranteedTailCallOpt)) 2023 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 2024 2025 // If the function takes variable number of arguments, make a frame index for 2026 // the start of the first vararg value... for expansion of llvm.va_start. 2027 if (isVarArg) { 2028 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 2029 CallConv != CallingConv::X86_ThisCall)) { 2030 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 2031 } 2032 if (Is64Bit) { 2033 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 2034 2035 // FIXME: We should really autogenerate these arrays 2036 static const uint16_t GPR64ArgRegsWin64[] = { 2037 X86::RCX, X86::RDX, X86::R8, X86::R9 2038 }; 2039 static const uint16_t GPR64ArgRegs64Bit[] = { 2040 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 2041 }; 2042 static const uint16_t XMMArgRegs64Bit[] = { 2043 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2044 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2045 }; 2046 const uint16_t *GPR64ArgRegs; 2047 unsigned NumXMMRegs = 0; 2048 2049 if (IsWin64) { 2050 // The XMM registers which might contain var arg parameters are shadowed 2051 // in their paired GPR. So we only need to save the GPR to their home 2052 // slots. 
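        // Consequently TotalNumXMMRegs stays 0 on Win64 and the XMM save loop
        // further down is skipped; only the four GPR home slots matter here.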
2053 TotalNumIntRegs = 4; 2054 GPR64ArgRegs = GPR64ArgRegsWin64; 2055 } else { 2056 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 2057 GPR64ArgRegs = GPR64ArgRegs64Bit; 2058 2059 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, 2060 TotalNumXMMRegs); 2061 } 2062 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 2063 TotalNumIntRegs); 2064 2065 bool NoImplicitFloatOps = Fn->getFnAttributes(). 2066 hasAttribute(Attribute::NoImplicitFloat); 2067 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 2068 "SSE register cannot be used when SSE is disabled!"); 2069 assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && 2070 NoImplicitFloatOps) && 2071 "SSE register cannot be used when SSE is disabled!"); 2072 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || 2073 !Subtarget->hasSSE1()) 2074 // Kernel mode asks for SSE to be disabled, so don't push them 2075 // on the stack. 2076 TotalNumXMMRegs = 0; 2077 2078 if (IsWin64) { 2079 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); 2080 // Get to the caller-allocated home save location. Add 8 to account 2081 // for the return address. 2082 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 2083 FuncInfo->setRegSaveFrameIndex( 2084 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 2085 // Fixup to set vararg frame on shadow area (4 x i64). 2086 if (NumIntRegs < 4) 2087 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 2088 } else { 2089 // For X86-64, if there are vararg parameters that are passed via 2090 // registers, then we must store them to their spots on the stack so 2091 // they may be loaded by dereferencing the result of va_next. 2092 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 2093 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 2094 FuncInfo->setRegSaveFrameIndex( 2095 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 2096 false)); 2097 } 2098 2099 // Store the integer parameter registers. 2100 SmallVector<SDValue, 8> MemOps; 2101 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 2102 getPointerTy()); 2103 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 2104 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 2105 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 2106 DAG.getIntPtrConstant(Offset)); 2107 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 2108 &X86::GR64RegClass); 2109 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 2110 SDValue Store = 2111 DAG.getStore(Val.getValue(1), dl, Val, FIN, 2112 MachinePointerInfo::getFixedStack( 2113 FuncInfo->getRegSaveFrameIndex(), Offset), 2114 false, false, 0); 2115 MemOps.push_back(Store); 2116 Offset += 8; 2117 } 2118 2119 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 2120 // Now store the XMM (fp + vector) parameter registers.
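        // The registers are not stored one by one here; instead they are fed
        // to a VASTART_SAVE_XMM_REGS node, which is expanded after isel into
        // stores guarded by a test of AL (the SSE-register count passed by
        // the caller).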
2121 SmallVector<SDValue, 11> SaveXMMOps; 2122 SaveXMMOps.push_back(Chain); 2123 2124 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 2125 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 2126 SaveXMMOps.push_back(ALVal); 2127 2128 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2129 FuncInfo->getRegSaveFrameIndex())); 2130 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2131 FuncInfo->getVarArgsFPOffset())); 2132 2133 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 2134 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 2135 &X86::VR128RegClass); 2136 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 2137 SaveXMMOps.push_back(Val); 2138 } 2139 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 2140 MVT::Other, 2141 &SaveXMMOps[0], SaveXMMOps.size())); 2142 } 2143 2144 if (!MemOps.empty()) 2145 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2146 &MemOps[0], MemOps.size()); 2147 } 2148 } 2149 2150 // Some CCs need callee pop. 2151 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2152 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2153 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 2154 } else { 2155 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 2156 // If this is an sret function, the return should pop the hidden pointer. 2157 if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2158 argsAreStructReturn(Ins) == StackStructReturn) 2159 FuncInfo->setBytesToPopOnReturn(4); 2160 } 2161 2162 if (!Is64Bit) { 2163 // RegSaveFrameIndex is X86-64 only. 2164 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 2165 if (CallConv == CallingConv::X86_FastCall || 2166 CallConv == CallingConv::X86_ThisCall) 2167 // fastcc functions can't have varargs. 2168 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 2169 } 2170 2171 FuncInfo->setArgumentStackSize(StackSize); 2172 2173 return Chain; 2174} 2175 2176SDValue 2177X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 2178 SDValue StackPtr, SDValue Arg, 2179 DebugLoc dl, SelectionDAG &DAG, 2180 const CCValAssign &VA, 2181 ISD::ArgFlagsTy Flags) const { 2182 unsigned LocMemOffset = VA.getLocMemOffset(); 2183 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 2184 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 2185 if (Flags.isByVal()) 2186 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 2187 2188 return DAG.getStore(Chain, dl, Arg, PtrOff, 2189 MachinePointerInfo::getStack(LocMemOffset), 2190 false, false, 0); 2191} 2192 2193/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 2194/// optimization is performed and it is required. 2195SDValue 2196X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 2197 SDValue &OutRetAddr, SDValue Chain, 2198 bool IsTailCall, bool Is64Bit, 2199 int FPDiff, DebugLoc dl) const { 2200 // Adjust the Return address stack slot. 2201 EVT VT = getPointerTy(); 2202 OutRetAddr = getReturnAddressFrameIndex(DAG); 2203 2204 // Load the "old" Return address. 2205 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 2206 false, false, false, 0); 2207 return SDValue(OutRetAddr.getNode(), 1); 2208} 2209 2210/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 2211/// optimization is performed and it is required (FPDiff!=0). 
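/// The value loaded by EmitTailCallLoadRetAddr is stored into a new fixed
/// object at offset FPDiff - SlotSize, which is where the return-address slot
/// ends up once the stack is retargeted for the tail call.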
2212static SDValue 2213EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 2214 SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, 2215 unsigned SlotSize, int FPDiff, DebugLoc dl) { 2216 // Store the return address to the appropriate stack slot. 2217 if (!FPDiff) return Chain; 2218 // Calculate the new stack slot for the return address. 2219 int NewReturnAddrFI = 2220 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 2221 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); 2222 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 2223 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 2224 false, false, 0); 2225 return Chain; 2226} 2227 2228SDValue 2229X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2230 SmallVectorImpl<SDValue> &InVals) const { 2231 SelectionDAG &DAG = CLI.DAG; 2232 DebugLoc &dl = CLI.DL; 2233 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 2234 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 2235 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 2236 SDValue Chain = CLI.Chain; 2237 SDValue Callee = CLI.Callee; 2238 CallingConv::ID CallConv = CLI.CallConv; 2239 bool &isTailCall = CLI.IsTailCall; 2240 bool isVarArg = CLI.IsVarArg; 2241 2242 MachineFunction &MF = DAG.getMachineFunction(); 2243 bool Is64Bit = Subtarget->is64Bit(); 2244 bool IsWin64 = Subtarget->isTargetWin64(); 2245 bool IsWindows = Subtarget->isTargetWindows(); 2246 StructReturnType SR = callIsStructReturn(Outs); 2247 bool IsSibcall = false; 2248 2249 if (MF.getTarget().Options.DisableTailCalls) 2250 isTailCall = false; 2251 2252 if (isTailCall) { 2253 // Check if it's really possible to do a tail call. 2254 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 2255 isVarArg, SR != NotStructReturn, 2256 MF.getFunction()->hasStructRetAttr(), CLI.RetTy, 2257 Outs, OutVals, Ins, DAG); 2258 2259 // Sibcalls are automatically detected tailcalls which do not require 2260 // ABI changes. 2261 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) 2262 IsSibcall = true; 2263 2264 if (isTailCall) 2265 ++NumTailCalls; 2266 } 2267 2268 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 2269 "Var args not supported with calling convention fastcc, ghc or hipe"); 2270 2271 // Analyze operands of the call, assigning locations to each operand. 2272 SmallVector<CCValAssign, 16> ArgLocs; 2273 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2274 ArgLocs, *DAG.getContext()); 2275 2276 // Allocate shadow area for Win64 2277 if (IsWin64) { 2278 CCInfo.AllocateStack(32, 8); 2279 } 2280 2281 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2282 2283 // Get a count of how many bytes are to be pushed on the stack. 2284 unsigned NumBytes = CCInfo.getNextStackOffset(); 2285 if (IsSibcall) 2286 // This is a sibcall. The memory operands are available in caller's 2287 // own caller's stack. 2288 NumBytes = 0; 2289 else if (getTargetMachine().Options.GuaranteedTailCallOpt && 2290 IsTailCallConvention(CallConv)) 2291 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 2292 2293 int FPDiff = 0; 2294 if (isTailCall && !IsSibcall) { 2295 // Lower arguments at fp - stackoffset + fpdiff. 2296 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); 2297 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); 2298 2299 FPDiff = NumBytesCallerPushed - NumBytes; 2300 2301 // Set the delta of movement of the returnaddr stackslot. 2302 // But only set if delta is greater than previous delta. 
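    // A negative FPDiff means the callee needs more argument space than the
    // caller provided; keeping the minimum across call sites records the
    // largest downward move of the return address the frame must accommodate.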
2303 if (FPDiff < X86Info->getTCReturnAddrDelta()) 2304 X86Info->setTCReturnAddrDelta(FPDiff); 2305 } 2306 2307 if (!IsSibcall) 2308 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 2309 2310 SDValue RetAddrFrIdx; 2311 // Load return address for tail calls. 2312 if (isTailCall && FPDiff) 2313 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 2314 Is64Bit, FPDiff, dl); 2315 2316 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2317 SmallVector<SDValue, 8> MemOpChains; 2318 SDValue StackPtr; 2319 2320 // Walk the register/memloc assignments, inserting copies/loads. In the case 2321 // of tail call optimization arguments are handled later. 2322 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2323 CCValAssign &VA = ArgLocs[i]; 2324 EVT RegVT = VA.getLocVT(); 2325 SDValue Arg = OutVals[i]; 2326 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2327 bool isByVal = Flags.isByVal(); 2328 2329 // Promote the value if needed. 2330 switch (VA.getLocInfo()) { 2331 default: llvm_unreachable("Unknown loc info!"); 2332 case CCValAssign::Full: break; 2333 case CCValAssign::SExt: 2334 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 2335 break; 2336 case CCValAssign::ZExt: 2337 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 2338 break; 2339 case CCValAssign::AExt: 2340 if (RegVT.is128BitVector()) { 2341 // Special case: passing MMX values in XMM registers. 2342 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 2343 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 2344 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 2345 } else 2346 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 2347 break; 2348 case CCValAssign::BCvt: 2349 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 2350 break; 2351 case CCValAssign::Indirect: { 2352 // Store the argument. 2353 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2354 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2355 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2356 MachinePointerInfo::getFixedStack(FI), 2357 false, false, 0); 2358 Arg = SpillSlot; 2359 break; 2360 } 2361 } 2362 2363 if (VA.isRegLoc()) { 2364 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2365 if (isVarArg && IsWin64) { 2366 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2367 // shadow reg if callee is a varargs function. 2368 unsigned ShadowReg = 0; 2369 switch (VA.getLocReg()) { 2370 case X86::XMM0: ShadowReg = X86::RCX; break; 2371 case X86::XMM1: ShadowReg = X86::RDX; break; 2372 case X86::XMM2: ShadowReg = X86::R8; break; 2373 case X86::XMM3: ShadowReg = X86::R9; break; 2374 } 2375 if (ShadowReg) 2376 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 2377 } 2378 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2379 assert(VA.isMemLoc()); 2380 if (StackPtr.getNode() == 0) 2381 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), 2382 getPointerTy()); 2383 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2384 dl, DAG, VA, Flags)); 2385 } 2386 } 2387 2388 if (!MemOpChains.empty()) 2389 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2390 &MemOpChains[0], MemOpChains.size()); 2391 2392 if (Subtarget->isPICStyleGOT()) { 2393 // ELF / PIC requires the GOT pointer to be in the EBX register before 2394 // function calls via the PLT.
2395 if (!isTailCall) { 2396 RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), 2397 DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()))); 2398 } else { 2399 // If we are tail calling and generating PIC/GOT style code load the 2400 // address of the callee into ECX. The value in ecx is used as target of 2401 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2402 // for tail calls on PIC/GOT architectures. Normally we would just put the 2403 // address of GOT into ebx and then call target@PLT. But for tail calls 2404 // ebx would be restored (since ebx is callee saved) before jumping to the 2405 // target@PLT. 2406 2407 // Note: The actual moving to ECX is done further down. 2408 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2409 if (G && !G->getGlobal()->hasHiddenVisibility() && 2410 !G->getGlobal()->hasProtectedVisibility()) 2411 Callee = LowerGlobalAddress(Callee, DAG); 2412 else if (isa<ExternalSymbolSDNode>(Callee)) 2413 Callee = LowerExternalSymbol(Callee, DAG); 2414 } 2415 } 2416 2417 if (Is64Bit && isVarArg && !IsWin64) { 2418 // From AMD64 ABI document: 2419 // For calls that may call functions that use varargs or stdargs 2420 // (prototype-less calls or calls to functions containing ellipsis (...) in 2421 // the declaration) %al is used as hidden argument to specify the number 2422 // of SSE registers used. The contents of %al do not need to match exactly 2423 // the number of registers, but must be an ubound on the number of SSE 2424 // registers used and is in the range 0 - 8 inclusive. 2425 2426 // Count the number of XMM registers allocated. 2427 static const uint16_t XMMArgRegs[] = { 2428 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2429 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2430 }; 2431 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2432 assert((Subtarget->hasSSE1() || !NumXMMRegs) 2433 && "SSE registers cannot be used when SSE is disabled"); 2434 2435 RegsToPass.push_back(std::make_pair(unsigned(X86::AL), 2436 DAG.getConstant(NumXMMRegs, MVT::i8))); 2437 } 2438 2439 // For tail calls lower the arguments to the 'real' stack slot. 2440 if (isTailCall) { 2441 // Force all the incoming stack arguments to be loaded from the stack 2442 // before any new outgoing arguments are stored to the stack, because the 2443 // outgoing stack slots may alias the incoming argument stack slots, and 2444 // the alias isn't otherwise explicit. This is slightly more conservative 2445 // than necessary, because it means that each store effectively depends 2446 // on every argument instead of just those arguments it would clobber. 2447 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2448 2449 SmallVector<SDValue, 8> MemOpChains2; 2450 SDValue FIN; 2451 int FI = 0; 2452 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2453 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2454 CCValAssign &VA = ArgLocs[i]; 2455 if (VA.isRegLoc()) 2456 continue; 2457 assert(VA.isMemLoc()); 2458 SDValue Arg = OutVals[i]; 2459 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2460 // Create frame index. 2461 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2462 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2463 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2464 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2465 2466 if (Flags.isByVal()) { 2467 // Copy relative to framepointer. 
2468 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2469 if (StackPtr.getNode() == 0) 2470 StackPtr = DAG.getCopyFromReg(Chain, dl, 2471 RegInfo->getStackRegister(), 2472 getPointerTy()); 2473 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2474 2475 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2476 ArgChain, 2477 Flags, DAG, dl)); 2478 } else { 2479 // Store relative to framepointer. 2480 MemOpChains2.push_back( 2481 DAG.getStore(ArgChain, dl, Arg, FIN, 2482 MachinePointerInfo::getFixedStack(FI), 2483 false, false, 0)); 2484 } 2485 } 2486 } 2487 2488 if (!MemOpChains2.empty()) 2489 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2490 &MemOpChains2[0], MemOpChains2.size()); 2491 2492 // Store the return address to the appropriate stack slot. 2493 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, 2494 getPointerTy(), RegInfo->getSlotSize(), 2495 FPDiff, dl); 2496 } 2497 2498 // Build a sequence of copy-to-reg nodes chained together with token chain 2499 // and flag operands which copy the outgoing args into registers. 2500 SDValue InFlag; 2501 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2502 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2503 RegsToPass[i].second, InFlag); 2504 InFlag = Chain.getValue(1); 2505 } 2506 2507 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2508 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2509 // In the 64-bit large code model, we have to make all calls 2510 // through a register, since the call instruction's 32-bit 2511 // pc-relative offset may not be large enough to hold the whole 2512 // address. 2513 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2514 // If the callee is a GlobalAddress node (quite common, every direct call 2515 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2516 // it. 2517 2518 // We should use extra load for direct calls to dllimported functions in 2519 // non-JIT mode. 2520 const GlobalValue *GV = G->getGlobal(); 2521 if (!GV->hasDLLImportLinkage()) { 2522 unsigned char OpFlags = 0; 2523 bool ExtraLoad = false; 2524 unsigned WrapperKind = ISD::DELETED_NODE; 2525 2526 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2527 // external symbols must go through the PLT in PIC mode. If the symbol 2528 // has hidden or protected visibility, or if it is static or local, then 2529 // we don't need to use the PLT - we can directly call it. 2530 if (Subtarget->isTargetELF() && 2531 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2532 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2533 OpFlags = X86II::MO_PLT; 2534 } else if (Subtarget->isPICStyleStubAny() && 2535 (GV->isDeclaration() || GV->isWeakForLinker()) && 2536 (!Subtarget->getTargetTriple().isMacOSX() || 2537 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2538 // PC-relative references to external symbols should go through $stub, 2539 // unless we're building with the leopard linker or later, which 2540 // automatically synthesizes these stubs. 2541 OpFlags = X86II::MO_DARWIN_STUB; 2542 } else if (Subtarget->isPICStyleRIPRel() && 2543 isa<Function>(GV) && 2544 cast<Function>(GV)->getFnAttributes(). 2545 hasAttribute(Attribute::NonLazyBind)) { 2546 // If the function is marked as non-lazy, generate an indirect call 2547 // which loads from the GOT directly.
This avoids runtime overhead 2548 // at the cost of eager binding (and one extra byte of encoding). 2549 OpFlags = X86II::MO_GOTPCREL; 2550 WrapperKind = X86ISD::WrapperRIP; 2551 ExtraLoad = true; 2552 } 2553 2554 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2555 G->getOffset(), OpFlags); 2556 2557 // Add a wrapper if needed. 2558 if (WrapperKind != ISD::DELETED_NODE) 2559 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2560 // Add extra indirection if needed. 2561 if (ExtraLoad) 2562 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2563 MachinePointerInfo::getGOT(), 2564 false, false, false, 0); 2565 } 2566 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2567 unsigned char OpFlags = 0; 2568 2569 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2570 // external symbols should go through the PLT. 2571 if (Subtarget->isTargetELF() && 2572 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2573 OpFlags = X86II::MO_PLT; 2574 } else if (Subtarget->isPICStyleStubAny() && 2575 (!Subtarget->getTargetTriple().isMacOSX() || 2576 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2577 // PC-relative references to external symbols should go through $stub, 2578 // unless we're building with the leopard linker or later, which 2579 // automatically synthesizes these stubs. 2580 OpFlags = X86II::MO_DARWIN_STUB; 2581 } 2582 2583 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2584 OpFlags); 2585 } 2586 2587 // Returns a chain & a flag for retval copy to use. 2588 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2589 SmallVector<SDValue, 8> Ops; 2590 2591 if (!IsSibcall && isTailCall) { 2592 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2593 DAG.getIntPtrConstant(0, true), InFlag); 2594 InFlag = Chain.getValue(1); 2595 } 2596 2597 Ops.push_back(Chain); 2598 Ops.push_back(Callee); 2599 2600 if (isTailCall) 2601 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2602 2603 // Add argument registers to the end of the list so that they are known live 2604 // into the call. 2605 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2606 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2607 RegsToPass[i].second.getValueType())); 2608 2609 // Add a register mask operand representing the call-preserved registers. 2610 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 2611 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); 2612 assert(Mask && "Missing call preserved mask for calling convention"); 2613 Ops.push_back(DAG.getRegisterMask(Mask)); 2614 2615 if (InFlag.getNode()) 2616 Ops.push_back(InFlag); 2617 2618 if (isTailCall) { 2619 // We used to do: 2620 //// If this is the first return lowered for this function, add the regs 2621 //// to the liveout set for the function. 2622 // This isn't right, although it's probably harmless on x86; liveouts 2623 // should be computed from returns not tail calls. Consider a void 2624 // function making a tail call to a function returning int. 2625 return DAG.getNode(X86ISD::TC_RETURN, dl, 2626 NodeTys, &Ops[0], Ops.size()); 2627 } 2628 2629 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2630 InFlag = Chain.getValue(1); 2631 2632 // Create the CALLSEQ_END node. 
2633 unsigned NumBytesForCalleeToPush; 2634 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2635 getTargetMachine().Options.GuaranteedTailCallOpt)) 2636 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2637 else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2638 SR == StackStructReturn) 2639 // If this is a call to a struct-return function, the callee 2640 // pops the hidden struct pointer, so we have to push it back. 2641 // This is common for Darwin/X86, Linux & Mingw32 targets. 2642 // For MSVC Win32 targets, the caller pops the hidden struct pointer. 2643 NumBytesForCalleeToPush = 4; 2644 else 2645 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2646 2647 // Returns a flag for retval copy to use. 2648 if (!IsSibcall) { 2649 Chain = DAG.getCALLSEQ_END(Chain, 2650 DAG.getIntPtrConstant(NumBytes, true), 2651 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2652 true), 2653 InFlag); 2654 InFlag = Chain.getValue(1); 2655 } 2656 2657 // Handle result values, copying them out of physregs into vregs that we 2658 // return. 2659 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2660 Ins, dl, DAG, InVals); 2661} 2662 2663//===----------------------------------------------------------------------===// 2664// Fast Calling Convention (tail call) implementation 2665//===----------------------------------------------------------------------===// 2666 2667// Like StdCall, the callee cleans up the arguments; the convention differs in 2668// that ECX is reserved for storing the tail called function address. Only 2 2669// registers are free for argument passing (inreg). Tail call optimization is 2670// performed provided: 2671// * tailcallopt is enabled 2672// * caller/callee are fastcc 2673// On the X86_64 architecture with GOT-style position independent code, only 2674// local (within module) calls are supported at the moment. 2675// To keep the stack aligned according to the platform ABI, the function 2676// GetAlignedArgumentStackSize ensures that the argument delta is always a 2677// multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld, 2678// for example.) 2679// If the tail called function has more arguments than the caller, the caller 2680// needs to make sure that there is room to move the RETADDR to. This is 2681// achieved by reserving an area the size of the argument delta right after the 2682// original RETADDR, but before the saved framepointer or the spilled registers 2683// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) 2684// stack layout: 2685// arg1 2686// arg2 2687// RETADDR 2688// [ new RETADDR 2689// move area ] 2690// (possible EBP) 2691// ESI 2692// EDI 2693// local1 .. 2694 2695/// GetAlignedArgumentStackSize - Round the stack size up so that it is, e.g., 2696/// of the form 16n + 12 to satisfy a 16 byte alignment requirement. 2697unsigned 2698X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2699 SelectionDAG& DAG) const { 2700 MachineFunction &MF = DAG.getMachineFunction(); 2701 const TargetMachine &TM = MF.getTarget(); 2702 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 2703 unsigned StackAlignment = TFI.getStackAlignment(); 2704 uint64_t AlignMask = StackAlignment - 1; 2705 int64_t Offset = StackSize; 2706 unsigned SlotSize = RegInfo->getSlotSize(); 2707 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2708 // Number smaller than 12 so just add the difference. 2709 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2710 } else { 2711 // Mask out lower bits, add stackalignment once plus the 12 bytes.
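    // e.g. (assuming StackAlignment == 16 and SlotSize == 4) Offset = 30 has
    // low bits 14 > 12, so it becomes (30 & ~15) + 16 + 12 = 44, the next
    // value of the form 16n + 12.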
2711 Offset = ((~AlignMask) & Offset) + StackAlignment + 2712 (StackAlignment-SlotSize); 2713 } 2714 return Offset; 2715} 2716 2717/// MatchingStackOffset - Return true if the given stack call argument is 2718/// already available in the same position (relatively) of the caller's 2719/// incoming argument stack. 2720static 2721bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2722 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2723 const X86InstrInfo *TII) { 2724 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2725 int FI = INT_MAX; 2726 if (Arg.getOpcode() == ISD::CopyFromReg) { 2727 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2728 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2729 return false; 2730 MachineInstr *Def = MRI->getVRegDef(VR); 2731 if (!Def) 2732 return false; 2733 if (!Flags.isByVal()) { 2734 if (!TII->isLoadFromStackSlot(Def, FI)) 2735 return false; 2736 } else { 2737 unsigned Opcode = Def->getOpcode(); 2738 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2739 Def->getOperand(1).isFI()) { 2740 FI = Def->getOperand(1).getIndex(); 2741 Bytes = Flags.getByValSize(); 2742 } else 2743 return false; 2744 } 2745 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2746 if (Flags.isByVal()) 2747 // ByVal argument is passed in as a pointer but it's now being 2748 // dereferenced. e.g. 2749 // define @foo(%struct.X* %A) { 2750 // tail call @bar(%struct.X* byval %A) 2751 // } 2752 return false; 2753 SDValue Ptr = Ld->getBasePtr(); 2754 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2755 if (!FINode) 2756 return false; 2757 FI = FINode->getIndex(); 2758 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2759 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2760 FI = FINode->getIndex(); 2761 Bytes = Flags.getByValSize(); 2762 } else 2763 return false; 2764 2765 assert(FI != INT_MAX); 2766 if (!MFI->isFixedObjectIndex(FI)) 2767 return false; 2768 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2769} 2770 2771/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2772/// for tail call optimization. Targets which want to do tail call 2773/// optimization should implement this function. 2774bool 2775X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2776 CallingConv::ID CalleeCC, 2777 bool isVarArg, 2778 bool isCalleeStructRet, 2779 bool isCallerStructRet, 2780 Type *RetTy, 2781 const SmallVectorImpl<ISD::OutputArg> &Outs, 2782 const SmallVectorImpl<SDValue> &OutVals, 2783 const SmallVectorImpl<ISD::InputArg> &Ins, 2784 SelectionDAG& DAG) const { 2785 if (!IsTailCallConvention(CalleeCC) && 2786 CalleeCC != CallingConv::C) 2787 return false; 2788 2789 // If -tailcallopt is specified, make fastcc functions tail-callable. 2790 const MachineFunction &MF = DAG.getMachineFunction(); 2791 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2792 2793 // If the function return type is x86_fp80 and the callee return type is not, 2794 // then the FP_EXTEND of the call result is not a nop. It's not safe to 2795 // perform a tailcall optimization here. 
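  // e.g. (illustrative) a caller declared to return x86_fp80 that tail-calls a
  // callee returning double would need an FP_EXTEND of the call result, so
  // that pairing is rejected here.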
2796 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) 2797 return false; 2798 2799 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2800 bool CCMatch = CallerCC == CalleeCC; 2801 2802 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2803 if (IsTailCallConvention(CalleeCC) && CCMatch) 2804 return true; 2805 return false; 2806 } 2807 2808 // Look for obvious safe cases to perform tail call optimization that do not 2809 // require ABI changes. This is what gcc calls sibcall. 2810 2811 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2812 // emit a special epilogue. 2813 if (RegInfo->needsStackRealignment(MF)) 2814 return false; 2815 2816 // Also avoid sibcall optimization if either caller or callee uses struct 2817 // return semantics. 2818 if (isCalleeStructRet || isCallerStructRet) 2819 return false; 2820 2821 // An stdcall caller is expected to clean up its arguments; the callee 2822 // isn't going to do that. 2823 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2824 return false; 2825 2826 // Do not sibcall optimize vararg calls unless all arguments are passed via 2827 // registers. 2828 if (isVarArg && !Outs.empty()) { 2829 2830 // Optimizing for varargs on Win64 is unlikely to be safe without 2831 // additional testing. 2832 if (Subtarget->isTargetWin64()) 2833 return false; 2834 2835 SmallVector<CCValAssign, 16> ArgLocs; 2836 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2837 getTargetMachine(), ArgLocs, *DAG.getContext()); 2838 2839 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2840 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2841 if (!ArgLocs[i].isRegLoc()) 2842 return false; 2843 } 2844 2845 // If the call result is in ST0 / ST1, it needs to be popped off the x87 2846 // stack. Therefore, if it's not used by the call it is not safe to optimize 2847 // this into a sibcall. 2848 bool Unused = false; 2849 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2850 if (!Ins[i].Used) { 2851 Unused = true; 2852 break; 2853 } 2854 } 2855 if (Unused) { 2856 SmallVector<CCValAssign, 16> RVLocs; 2857 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 2858 getTargetMachine(), RVLocs, *DAG.getContext()); 2859 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2860 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2861 CCValAssign &VA = RVLocs[i]; 2862 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2863 return false; 2864 } 2865 } 2866 2867 // If the calling conventions do not match, then we'd better make sure the 2868 // results are returned in the same way as what the caller expects. 
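 // (Descriptive note: the two AnalyzeCallResult passes below compute where
 // each return value lives under the callee's and the caller's convention;
 // any difference in register or stack placement makes the sibcall unsafe.)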
2869 if (!CCMatch) { 2870 SmallVector<CCValAssign, 16> RVLocs1; 2871 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 2872 getTargetMachine(), RVLocs1, *DAG.getContext()); 2873 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2874 2875 SmallVector<CCValAssign, 16> RVLocs2; 2876 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 2877 getTargetMachine(), RVLocs2, *DAG.getContext()); 2878 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2879 2880 if (RVLocs1.size() != RVLocs2.size()) 2881 return false; 2882 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2883 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2884 return false; 2885 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2886 return false; 2887 if (RVLocs1[i].isRegLoc()) { 2888 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2889 return false; 2890 } else { 2891 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2892 return false; 2893 } 2894 } 2895 } 2896 2897 // If the callee takes no arguments then go on to check the results of the 2898 // call. 2899 if (!Outs.empty()) { 2900 // Check if stack adjustment is needed. For now, do not do this if any 2901 // argument is passed on the stack. 2902 SmallVector<CCValAssign, 16> ArgLocs; 2903 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2904 getTargetMachine(), ArgLocs, *DAG.getContext()); 2905 2906 // Allocate shadow area for Win64 2907 if (Subtarget->isTargetWin64()) { 2908 CCInfo.AllocateStack(32, 8); 2909 } 2910 2911 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2912 if (CCInfo.getNextStackOffset()) { 2913 MachineFunction &MF = DAG.getMachineFunction(); 2914 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2915 return false; 2916 2917 // Check if the arguments are already laid out in the right way as 2918 // the caller's fixed stack objects. 2919 MachineFrameInfo *MFI = MF.getFrameInfo(); 2920 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2921 const X86InstrInfo *TII = 2922 ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2923 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2924 CCValAssign &VA = ArgLocs[i]; 2925 SDValue Arg = OutVals[i]; 2926 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2927 if (VA.getLocInfo() == CCValAssign::Indirect) 2928 return false; 2929 if (!VA.isRegLoc()) { 2930 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2931 MFI, MRI, TII)) 2932 return false; 2933 } 2934 } 2935 } 2936 2937 // If the tailcall address may be in a register, then make sure it's 2938 // possible to register allocate for it. In 32-bit, the call address can 2939 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2940 // callee-saved registers are restored. These happen to be the same 2941 // registers used to pass 'inreg' arguments so watch out for those. 
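 // (Illustrative: a 32-bit call passing three integer 'inreg' arguments in
 // EAX, EDX and ECX would leave no free register for an indirect call target,
 // which is why the loop below refuses the sibcall once NumInRegs reaches 3.)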
2942 if (!Subtarget->is64Bit() && 2943 !isa<GlobalAddressSDNode>(Callee) && 2944 !isa<ExternalSymbolSDNode>(Callee)) { 2945 unsigned NumInRegs = 0; 2946 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2947 CCValAssign &VA = ArgLocs[i]; 2948 if (!VA.isRegLoc()) 2949 continue; 2950 unsigned Reg = VA.getLocReg(); 2951 switch (Reg) { 2952 default: break; 2953 case X86::EAX: case X86::EDX: case X86::ECX: 2954 if (++NumInRegs == 3) 2955 return false; 2956 break; 2957 } 2958 } 2959 } 2960 } 2961 2962 return true; 2963} 2964 2965FastISel * 2966X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 2967 const TargetLibraryInfo *libInfo) const { 2968 return X86::createFastISel(funcInfo, libInfo); 2969} 2970 2971//===----------------------------------------------------------------------===// 2972// Other Lowering Hooks 2973//===----------------------------------------------------------------------===// 2974 2975static bool MayFoldLoad(SDValue Op) { 2976 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2977} 2978 2979static bool MayFoldIntoStore(SDValue Op) { 2980 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2981} 2982 2983static bool isTargetShuffle(unsigned Opcode) { 2984 switch(Opcode) { 2985 default: return false; 2986 case X86ISD::PSHUFD: 2987 case X86ISD::PSHUFHW: 2988 case X86ISD::PSHUFLW: 2989 case X86ISD::SHUFP: 2990 case X86ISD::PALIGN: 2991 case X86ISD::MOVLHPS: 2992 case X86ISD::MOVLHPD: 2993 case X86ISD::MOVHLPS: 2994 case X86ISD::MOVLPS: 2995 case X86ISD::MOVLPD: 2996 case X86ISD::MOVSHDUP: 2997 case X86ISD::MOVSLDUP: 2998 case X86ISD::MOVDDUP: 2999 case X86ISD::MOVSS: 3000 case X86ISD::MOVSD: 3001 case X86ISD::UNPCKL: 3002 case X86ISD::UNPCKH: 3003 case X86ISD::VPERMILP: 3004 case X86ISD::VPERM2X128: 3005 case X86ISD::VPERMI: 3006 return true; 3007 } 3008} 3009 3010static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 3011 SDValue V1, SelectionDAG &DAG) { 3012 switch(Opc) { 3013 default: llvm_unreachable("Unknown x86 shuffle node"); 3014 case X86ISD::MOVSHDUP: 3015 case X86ISD::MOVSLDUP: 3016 case X86ISD::MOVDDUP: 3017 return DAG.getNode(Opc, dl, VT, V1); 3018 } 3019} 3020 3021static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 3022 SDValue V1, unsigned TargetMask, 3023 SelectionDAG &DAG) { 3024 switch(Opc) { 3025 default: llvm_unreachable("Unknown x86 shuffle node"); 3026 case X86ISD::PSHUFD: 3027 case X86ISD::PSHUFHW: 3028 case X86ISD::PSHUFLW: 3029 case X86ISD::VPERMILP: 3030 case X86ISD::VPERMI: 3031 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 3032 } 3033} 3034 3035static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 3036 SDValue V1, SDValue V2, unsigned TargetMask, 3037 SelectionDAG &DAG) { 3038 switch(Opc) { 3039 default: llvm_unreachable("Unknown x86 shuffle node"); 3040 case X86ISD::PALIGN: 3041 case X86ISD::SHUFP: 3042 case X86ISD::VPERM2X128: 3043 return DAG.getNode(Opc, dl, VT, V1, V2, 3044 DAG.getConstant(TargetMask, MVT::i8)); 3045 } 3046} 3047 3048static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 3049 SDValue V1, SDValue V2, SelectionDAG &DAG) { 3050 switch(Opc) { 3051 default: llvm_unreachable("Unknown x86 shuffle node"); 3052 case X86ISD::MOVLHPS: 3053 case X86ISD::MOVLHPD: 3054 case X86ISD::MOVHLPS: 3055 case X86ISD::MOVLPS: 3056 case X86ISD::MOVLPD: 3057 case X86ISD::MOVSS: 3058 case X86ISD::MOVSD: 3059 case X86ISD::UNPCKL: 3060 case X86ISD::UNPCKH: 3061 return DAG.getNode(Opc, dl, VT, V1, V2); 3062 } 3063} 
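// Usage sketch for the helpers above (illustrative, not an actual call site):
// getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, V1, 0x1B, DAG) builds a
// PSHUFD node whose immediate 0x1B (binary 00 01 10 11) reverses the four
// 32-bit elements of V1.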
3064 3065SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 3066 MachineFunction &MF = DAG.getMachineFunction(); 3067 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 3068 int ReturnAddrIndex = FuncInfo->getRAIndex(); 3069 3070 if (ReturnAddrIndex == 0) { 3071 // Set up a frame object for the return address. 3072 unsigned SlotSize = RegInfo->getSlotSize(); 3073 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 3074 false); 3075 FuncInfo->setRAIndex(ReturnAddrIndex); 3076 } 3077 3078 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 3079} 3080 3081bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 3082 bool hasSymbolicDisplacement) { 3083 // Offset should fit into 32 bit immediate field. 3084 if (!isInt<32>(Offset)) 3085 return false; 3086 3087 // If we don't have a symbolic displacement - we don't have any extra 3088 // restrictions. 3089 if (!hasSymbolicDisplacement) 3090 return true; 3091 3092 // FIXME: Some tweaks might be needed for medium code model. 3093 if (M != CodeModel::Small && M != CodeModel::Kernel) 3094 return false; 3095 3096 // For small code model we assume that latest object is 16MB before end of 31 3097 // bits boundary. We may also accept pretty large negative constants knowing 3098 // that all objects are in the positive half of address space. 3099 if (M == CodeModel::Small && Offset < 16*1024*1024) 3100 return true; 3101 3102 // For kernel code model we know that all object resist in the negative half 3103 // of 32bits address space. We may not accept negative offsets, since they may 3104 // be just off and we may accept pretty large positive ones. 3105 if (M == CodeModel::Kernel && Offset > 0) 3106 return true; 3107 3108 return false; 3109} 3110 3111/// isCalleePop - Determines whether the callee is required to pop its 3112/// own arguments. Callee pop is necessary to support tail calls. 3113bool X86::isCalleePop(CallingConv::ID CallingConv, 3114 bool is64Bit, bool IsVarArg, bool TailCallOpt) { 3115 if (IsVarArg) 3116 return false; 3117 3118 switch (CallingConv) { 3119 default: 3120 return false; 3121 case CallingConv::X86_StdCall: 3122 return !is64Bit; 3123 case CallingConv::X86_FastCall: 3124 return !is64Bit; 3125 case CallingConv::X86_ThisCall: 3126 return !is64Bit; 3127 case CallingConv::Fast: 3128 return TailCallOpt; 3129 case CallingConv::GHC: 3130 return TailCallOpt; 3131 case CallingConv::HiPE: 3132 return TailCallOpt; 3133 } 3134} 3135 3136/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 3137/// specific condition code, returning the condition code and the LHS/RHS of the 3138/// comparison to make. 3139static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 3140 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 3141 if (!isFP) { 3142 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 3143 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 3144 // X > -1 -> X == 0, jump !sign. 3145 RHS = DAG.getConstant(0, RHS.getValueType()); 3146 return X86::COND_NS; 3147 } 3148 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 3149 // X < 0 -> X == 0, jump on sign. 
3150 return X86::COND_S; 3151 } 3152 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 3153 // X < 1 -> X <= 0 3154 RHS = DAG.getConstant(0, RHS.getValueType()); 3155 return X86::COND_LE; 3156 } 3157 } 3158 3159 switch (SetCCOpcode) { 3160 default: llvm_unreachable("Invalid integer condition!"); 3161 case ISD::SETEQ: return X86::COND_E; 3162 case ISD::SETGT: return X86::COND_G; 3163 case ISD::SETGE: return X86::COND_GE; 3164 case ISD::SETLT: return X86::COND_L; 3165 case ISD::SETLE: return X86::COND_LE; 3166 case ISD::SETNE: return X86::COND_NE; 3167 case ISD::SETULT: return X86::COND_B; 3168 case ISD::SETUGT: return X86::COND_A; 3169 case ISD::SETULE: return X86::COND_BE; 3170 case ISD::SETUGE: return X86::COND_AE; 3171 } 3172 } 3173 3174 // First determine if it is required or is profitable to flip the operands. 3175 3176 // If LHS is a foldable load, but RHS is not, flip the condition. 3177 if (ISD::isNON_EXTLoad(LHS.getNode()) && 3178 !ISD::isNON_EXTLoad(RHS.getNode())) { 3179 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 3180 std::swap(LHS, RHS); 3181 } 3182 3183 switch (SetCCOpcode) { 3184 default: break; 3185 case ISD::SETOLT: 3186 case ISD::SETOLE: 3187 case ISD::SETUGT: 3188 case ISD::SETUGE: 3189 std::swap(LHS, RHS); 3190 break; 3191 } 3192 3193 // On a floating point condition, the flags are set as follows: 3194 // ZF PF CF op 3195 // 0 | 0 | 0 | X > Y 3196 // 0 | 0 | 1 | X < Y 3197 // 1 | 0 | 0 | X == Y 3198 // 1 | 1 | 1 | unordered 3199 switch (SetCCOpcode) { 3200 default: llvm_unreachable("Condcode should be pre-legalized away"); 3201 case ISD::SETUEQ: 3202 case ISD::SETEQ: return X86::COND_E; 3203 case ISD::SETOLT: // flipped 3204 case ISD::SETOGT: 3205 case ISD::SETGT: return X86::COND_A; 3206 case ISD::SETOLE: // flipped 3207 case ISD::SETOGE: 3208 case ISD::SETGE: return X86::COND_AE; 3209 case ISD::SETUGT: // flipped 3210 case ISD::SETULT: 3211 case ISD::SETLT: return X86::COND_B; 3212 case ISD::SETUGE: // flipped 3213 case ISD::SETULE: 3214 case ISD::SETLE: return X86::COND_BE; 3215 case ISD::SETONE: 3216 case ISD::SETNE: return X86::COND_NE; 3217 case ISD::SETUO: return X86::COND_P; 3218 case ISD::SETO: return X86::COND_NP; 3219 case ISD::SETOEQ: 3220 case ISD::SETUNE: return X86::COND_INVALID; 3221 } 3222} 3223 3224/// hasFPCMov - is there a floating point cmov for the specific X86 condition 3225/// code. Current x86 isa includes the following FP cmov instructions: 3226/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 3227static bool hasFPCMov(unsigned X86CC) { 3228 switch (X86CC) { 3229 default: 3230 return false; 3231 case X86::COND_B: 3232 case X86::COND_BE: 3233 case X86::COND_E: 3234 case X86::COND_P: 3235 case X86::COND_A: 3236 case X86::COND_AE: 3237 case X86::COND_NE: 3238 case X86::COND_NP: 3239 return true; 3240 } 3241} 3242 3243/// isFPImmLegal - Returns true if the target can instruction select the 3244/// specified FP immediate natively. If false, the legalizer will 3245/// materialize the FP immediate as a load from a constant pool. 3246bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 3247 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 3248 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 3249 return true; 3250 } 3251 return false; 3252} 3253 3254/// isUndefOrInRange - Return true if Val is undef or if its value falls within 3255/// the specified range (L, H]. 
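/// For example, isUndefOrInRange(-1, 0, 4) and isUndefOrInRange(3, 0, 4) hold,
/// while isUndefOrInRange(4, 0, 4) does not; the range is half-open,
/// i.e. Low <= Val < Hi.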
3256static bool isUndefOrInRange(int Val, int Low, int Hi) { 3257 return (Val < 0) || (Val >= Low && Val < Hi); 3258} 3259 3260/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 3261/// specified value. 3262static bool isUndefOrEqual(int Val, int CmpVal) { 3263 return (Val < 0 || Val == CmpVal); 3264} 3265 3266/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning 3267/// from position Pos and ending in Pos+Size, falls within the specified 3268/// sequential range (L, L+Pos]. or is undef. 3269static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, 3270 unsigned Pos, unsigned Size, int Low) { 3271 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) 3272 if (!isUndefOrEqual(Mask[i], Low)) 3273 return false; 3274 return true; 3275} 3276 3277/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 3278/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 3279/// the second operand. 3280static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { 3281 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 3282 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 3283 if (VT == MVT::v2f64 || VT == MVT::v2i64) 3284 return (Mask[0] < 2 && Mask[1] < 2); 3285 return false; 3286} 3287 3288/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 3289/// is suitable for input to PSHUFHW. 3290static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { 3291 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) 3292 return false; 3293 3294 // Lower quadword copied in order or undef. 3295 if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) 3296 return false; 3297 3298 // Upper quadword shuffled. 3299 for (unsigned i = 4; i != 8; ++i) 3300 if (!isUndefOrInRange(Mask[i], 4, 8)) 3301 return false; 3302 3303 if (VT == MVT::v16i16) { 3304 // Lower quadword copied in order or undef. 3305 if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) 3306 return false; 3307 3308 // Upper quadword shuffled. 3309 for (unsigned i = 12; i != 16; ++i) 3310 if (!isUndefOrInRange(Mask[i], 12, 16)) 3311 return false; 3312 } 3313 3314 return true; 3315} 3316 3317/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 3318/// is suitable for input to PSHUFLW. 3319static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { 3320 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) 3321 return false; 3322 3323 // Upper quadword copied in order. 3324 if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) 3325 return false; 3326 3327 // Lower quadword shuffled. 3328 for (unsigned i = 0; i != 4; ++i) 3329 if (!isUndefOrInRange(Mask[i], 0, 4)) 3330 return false; 3331 3332 if (VT == MVT::v16i16) { 3333 // Upper quadword copied in order. 3334 if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) 3335 return false; 3336 3337 // Lower quadword shuffled. 3338 for (unsigned i = 8; i != 12; ++i) 3339 if (!isUndefOrInRange(Mask[i], 8, 12)) 3340 return false; 3341 } 3342 3343 return true; 3344} 3345 3346/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 3347/// is suitable for input to PALIGNR. 
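/// For example (illustrative), the v8i16 mask <3, 4, 5, 6, 7, 8, 9, 10>, a
/// rotation through the concatenated sources, is accepted by this predicate.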
3348static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT, 3349 const X86Subtarget *Subtarget) { 3350 if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) || 3351 (VT.getSizeInBits() == 256 && !Subtarget->hasInt256())) 3352 return false; 3353 3354 unsigned NumElts = VT.getVectorNumElements(); 3355 unsigned NumLanes = VT.getSizeInBits()/128; 3356 unsigned NumLaneElts = NumElts/NumLanes; 3357 3358 // Do not handle 64-bit element shuffles with palignr. 3359 if (NumLaneElts == 2) 3360 return false; 3361 3362 for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { 3363 unsigned i; 3364 for (i = 0; i != NumLaneElts; ++i) { 3365 if (Mask[i+l] >= 0) 3366 break; 3367 } 3368 3369 // Lane is all undef, go to next lane 3370 if (i == NumLaneElts) 3371 continue; 3372 3373 int Start = Mask[i+l]; 3374 3375 // Make sure its in this lane in one of the sources 3376 if (!isUndefOrInRange(Start, l, l+NumLaneElts) && 3377 !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) 3378 return false; 3379 3380 // If not lane 0, then we must match lane 0 3381 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) 3382 return false; 3383 3384 // Correct second source to be contiguous with first source 3385 if (Start >= (int)NumElts) 3386 Start -= NumElts - NumLaneElts; 3387 3388 // Make sure we're shifting in the right direction. 3389 if (Start <= (int)(i+l)) 3390 return false; 3391 3392 Start -= i; 3393 3394 // Check the rest of the elements to see if they are consecutive. 3395 for (++i; i != NumLaneElts; ++i) { 3396 int Idx = Mask[i+l]; 3397 3398 // Make sure its in this lane 3399 if (!isUndefOrInRange(Idx, l, l+NumLaneElts) && 3400 !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts)) 3401 return false; 3402 3403 // If not lane 0, then we must match lane 0 3404 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l)) 3405 return false; 3406 3407 if (Idx >= (int)NumElts) 3408 Idx -= NumElts - NumLaneElts; 3409 3410 if (!isUndefOrEqual(Idx, Start+i)) 3411 return false; 3412 3413 } 3414 } 3415 3416 return true; 3417} 3418 3419/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3420/// the two vector operands have swapped position. 3421static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, 3422 unsigned NumElems) { 3423 for (unsigned i = 0; i != NumElems; ++i) { 3424 int idx = Mask[i]; 3425 if (idx < 0) 3426 continue; 3427 else if (idx < (int)NumElems) 3428 Mask[i] = idx + NumElems; 3429 else 3430 Mask[i] = idx - NumElems; 3431 } 3432} 3433 3434/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 3435/// specifies a shuffle of elements that is suitable for input to 128/256-bit 3436/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be 3437/// reverse of what x86 shuffles want. 3438static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256, 3439 bool Commuted = false) { 3440 if (!HasFp256 && VT.getSizeInBits() == 256) 3441 return false; 3442 3443 unsigned NumElems = VT.getVectorNumElements(); 3444 unsigned NumLanes = VT.getSizeInBits()/128; 3445 unsigned NumLaneElems = NumElems/NumLanes; 3446 3447 if (NumLaneElems != 2 && NumLaneElems != 4) 3448 return false; 3449 3450 // VSHUFPSY divides the resulting vector into 4 chunks. 3451 // The sources are also splitted into 4 chunks, and each destination 3452 // chunk must come from a different source chunk. 
3453 // 3454 // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 3455 // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9 3456 // 3457 // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, 3458 // Y3..Y0, Y3..Y0, X3..X0, X3..X0 3459 // 3460 // VSHUFPDY divides the resulting vector into 4 chunks. 3461 // The sources are also splitted into 4 chunks, and each destination 3462 // chunk must come from a different source chunk. 3463 // 3464 // SRC1 => X3 X2 X1 X0 3465 // SRC2 => Y3 Y2 Y1 Y0 3466 // 3467 // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 3468 // 3469 unsigned HalfLaneElems = NumLaneElems/2; 3470 for (unsigned l = 0; l != NumElems; l += NumLaneElems) { 3471 for (unsigned i = 0; i != NumLaneElems; ++i) { 3472 int Idx = Mask[i+l]; 3473 unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0); 3474 if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems)) 3475 return false; 3476 // For VSHUFPSY, the mask of the second half must be the same as the 3477 // first but with the appropriate offsets. This works in the same way as 3478 // VPERMILPS works with masks. 3479 if (NumElems != 8 || l == 0 || Mask[i] < 0) 3480 continue; 3481 if (!isUndefOrEqual(Idx, Mask[i]+l)) 3482 return false; 3483 } 3484 } 3485 3486 return true; 3487} 3488 3489/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3490/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 3491static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) { 3492 if (!VT.is128BitVector()) 3493 return false; 3494 3495 unsigned NumElems = VT.getVectorNumElements(); 3496 3497 if (NumElems != 4) 3498 return false; 3499 3500 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3501 return isUndefOrEqual(Mask[0], 6) && 3502 isUndefOrEqual(Mask[1], 7) && 3503 isUndefOrEqual(Mask[2], 2) && 3504 isUndefOrEqual(Mask[3], 3); 3505} 3506 3507/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3508/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3509/// <2, 3, 2, 3> 3510static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) { 3511 if (!VT.is128BitVector()) 3512 return false; 3513 3514 unsigned NumElems = VT.getVectorNumElements(); 3515 3516 if (NumElems != 4) 3517 return false; 3518 3519 return isUndefOrEqual(Mask[0], 2) && 3520 isUndefOrEqual(Mask[1], 3) && 3521 isUndefOrEqual(Mask[2], 2) && 3522 isUndefOrEqual(Mask[3], 3); 3523} 3524 3525/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3526/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3527static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { 3528 if (!VT.is128BitVector()) 3529 return false; 3530 3531 unsigned NumElems = VT.getVectorNumElements(); 3532 3533 if (NumElems != 2 && NumElems != 4) 3534 return false; 3535 3536 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3537 if (!isUndefOrEqual(Mask[i], i + NumElems)) 3538 return false; 3539 3540 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) 3541 if (!isUndefOrEqual(Mask[i], i)) 3542 return false; 3543 3544 return true; 3545} 3546 3547/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3548/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 
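/// For v4f32 this means the mask <0, 1, 4, 5>: the low halves of both sources.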
3549static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { 3550 if (!VT.is128BitVector()) 3551 return false; 3552 3553 unsigned NumElems = VT.getVectorNumElements(); 3554 3555 if (NumElems != 2 && NumElems != 4) 3556 return false; 3557 3558 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3559 if (!isUndefOrEqual(Mask[i], i)) 3560 return false; 3561 3562 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3563 if (!isUndefOrEqual(Mask[i + e], i + NumElems)) 3564 return false; 3565 3566 return true; 3567} 3568 3569// 3570// Some special combinations that can be optimized. 3571// 3572static 3573SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, 3574 SelectionDAG &DAG) { 3575 EVT VT = SVOp->getValueType(0); 3576 DebugLoc dl = SVOp->getDebugLoc(); 3577 3578 if (VT != MVT::v8i32 && VT != MVT::v8f32) 3579 return SDValue(); 3580 3581 ArrayRef<int> Mask = SVOp->getMask(); 3582 3583 // These are the special masks that may be optimized. 3584 static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; 3585 static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; 3586 bool MatchEvenMask = true; 3587 bool MatchOddMask = true; 3588 for (int i=0; i<8; ++i) { 3589 if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) 3590 MatchEvenMask = false; 3591 if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) 3592 MatchOddMask = false; 3593 } 3594 3595 if (!MatchEvenMask && !MatchOddMask) 3596 return SDValue(); 3597 3598 SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); 3599 3600 SDValue Op0 = SVOp->getOperand(0); 3601 SDValue Op1 = SVOp->getOperand(1); 3602 3603 if (MatchEvenMask) { 3604 // Shift the second operand right to 32 bits. 3605 static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; 3606 Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); 3607 } else { 3608 // Shift the first operand left to 32 bits. 3609 static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; 3610 Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); 3611 } 3612 static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; 3613 return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); 3614} 3615 3616/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3617/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3618static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, 3619 bool HasInt256, bool V2IsSplat = false) { 3620 unsigned NumElts = VT.getVectorNumElements(); 3621 3622 assert((VT.is128BitVector() || VT.is256BitVector()) && 3623 "Unsupported vector type for unpckh"); 3624 3625 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3626 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 3627 return false; 3628 3629 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3630 // independently on 128-bit lanes. 
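 // For example (illustrative): v4i32 UNPCKL expects <0, 4, 1, 5>, and v8i32
 // with AVX expects <0, 8, 1, 9, 4, 12, 5, 13>, i.e. each 128-bit lane is
 // interleaved separately.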
3631 unsigned NumLanes = VT.getSizeInBits()/128; 3632 unsigned NumLaneElts = NumElts/NumLanes; 3633 3634 for (unsigned l = 0; l != NumLanes; ++l) { 3635 for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; 3636 i != (l+1)*NumLaneElts; 3637 i += 2, ++j) { 3638 int BitI = Mask[i]; 3639 int BitI1 = Mask[i+1]; 3640 if (!isUndefOrEqual(BitI, j)) 3641 return false; 3642 if (V2IsSplat) { 3643 if (!isUndefOrEqual(BitI1, NumElts)) 3644 return false; 3645 } else { 3646 if (!isUndefOrEqual(BitI1, j + NumElts)) 3647 return false; 3648 } 3649 } 3650 } 3651 3652 return true; 3653} 3654 3655/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3656/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3657static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, 3658 bool HasInt256, bool V2IsSplat = false) { 3659 unsigned NumElts = VT.getVectorNumElements(); 3660 3661 assert((VT.is128BitVector() || VT.is256BitVector()) && 3662 "Unsupported vector type for unpckh"); 3663 3664 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3665 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 3666 return false; 3667 3668 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3669 // independently on 128-bit lanes. 3670 unsigned NumLanes = VT.getSizeInBits()/128; 3671 unsigned NumLaneElts = NumElts/NumLanes; 3672 3673 for (unsigned l = 0; l != NumLanes; ++l) { 3674 for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; 3675 i != (l+1)*NumLaneElts; i += 2, ++j) { 3676 int BitI = Mask[i]; 3677 int BitI1 = Mask[i+1]; 3678 if (!isUndefOrEqual(BitI, j)) 3679 return false; 3680 if (V2IsSplat) { 3681 if (isUndefOrEqual(BitI1, NumElts)) 3682 return false; 3683 } else { 3684 if (!isUndefOrEqual(BitI1, j+NumElts)) 3685 return false; 3686 } 3687 } 3688 } 3689 return true; 3690} 3691 3692/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3693/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3694/// <0, 0, 1, 1> 3695static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, 3696 bool HasInt256) { 3697 unsigned NumElts = VT.getVectorNumElements(); 3698 3699 assert((VT.is128BitVector() || VT.is256BitVector()) && 3700 "Unsupported vector type for unpckh"); 3701 3702 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3703 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 3704 return false; 3705 3706 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern 3707 // FIXME: Need a better way to get rid of this, there's no latency difference 3708 // between UNPCKLPD and MOVDDUP, the later should always be checked first and 3709 // the former later. We should also remove the "_undef" special mask. 3710 if (NumElts == 4 && VT.getSizeInBits() == 256) 3711 return false; 3712 3713 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3714 // independently on 128-bit lanes. 3715 unsigned NumLanes = VT.getSizeInBits()/128; 3716 unsigned NumLaneElts = NumElts/NumLanes; 3717 3718 for (unsigned l = 0; l != NumLanes; ++l) { 3719 for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; 3720 i != (l+1)*NumLaneElts; 3721 i += 2, ++j) { 3722 int BitI = Mask[i]; 3723 int BitI1 = Mask[i+1]; 3724 3725 if (!isUndefOrEqual(BitI, j)) 3726 return false; 3727 if (!isUndefOrEqual(BitI1, j)) 3728 return false; 3729 } 3730 } 3731 3732 return true; 3733} 3734 3735/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3736/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. 
vector_shuffle v, undef,
3737 /// <2, 2, 3, 3>
3738 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
3739 unsigned NumElts = VT.getVectorNumElements();
3740
3741 assert((VT.is128BitVector() || VT.is256BitVector()) &&
3742 "Unsupported vector type for unpckh");
3743
3744 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
3745 (!HasInt256 || (NumElts != 16 && NumElts != 32)))
3746 return false;
3747
3748 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
3749 // independently on 128-bit lanes.
3750 unsigned NumLanes = VT.getSizeInBits()/128;
3751 unsigned NumLaneElts = NumElts/NumLanes;
3752
3753 for (unsigned l = 0; l != NumLanes; ++l) {
3754 for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
3755 i != (l+1)*NumLaneElts; i += 2, ++j) {
3756 int BitI = Mask[i];
3757 int BitI1 = Mask[i+1];
3758 if (!isUndefOrEqual(BitI, j))
3759 return false;
3760 if (!isUndefOrEqual(BitI1, j))
3761 return false;
3762 }
3763 }
3764 return true;
3765 }
3766
3767 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3768 /// specifies a shuffle of elements that is suitable for input to MOVSS,
3769 /// MOVSD, and MOVD, i.e. setting the lowest element.
3770 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
3771 if (VT.getVectorElementType().getSizeInBits() < 32)
3772 return false;
3773 if (!VT.is128BitVector())
3774 return false;
3775
3776 unsigned NumElts = VT.getVectorNumElements();
3777
3778 if (!isUndefOrEqual(Mask[0], NumElts))
3779 return false;
3780
3781 for (unsigned i = 1; i != NumElts; ++i)
3782 if (!isUndefOrEqual(Mask[i], i))
3783 return false;
3784
3785 return true;
3786 }
3787
3788 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
3789 /// as permutations between 128-bit chunks or halves. As an example: this
3790 /// shuffle below:
3791 /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
3792 /// The first half comes from the second half of V1 and the second half comes from
3793 /// the second half of V2.
3794 static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
3795 if (!HasFp256 || !VT.is256BitVector())
3796 return false;
3797
3798 // The shuffle result is divided into half A and half B. In total the two
3799 // sources have 4 halves, namely: C, D, E, F. The final values of A and
3800 // B must come from C, D, E or F.
3801 unsigned HalfSize = VT.getVectorNumElements()/2;
3802 bool MatchA = false, MatchB = false;
3803
3804 // Check if A comes from one of C, D, E, F.
3805 for (unsigned Half = 0; Half != 4; ++Half) {
3806 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
3807 MatchA = true;
3808 break;
3809 }
3810 }
3811
3812 // Check if B comes from one of C, D, E, F.
3813 for (unsigned Half = 0; Half != 4; ++Half) {
3814 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
3815 MatchB = true;
3816 break;
3817 }
3818 }
3819
3820 return MatchA && MatchB;
3821 }
3822
3823 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
3824 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
3825 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
3826 EVT VT = SVOp->getValueType(0);
3827
3828 unsigned HalfSize = VT.getVectorNumElements()/2;
3829
3830 unsigned FstHalf = 0, SndHalf = 0;
3831 for (unsigned i = 0; i < HalfSize; ++i) {
3832 if (SVOp->getMaskElt(i) > 0) {
3833 FstHalf = SVOp->getMaskElt(i)/HalfSize;
3834 break;
3835 }
3836 }
3837 for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
3838 if (SVOp->getMaskElt(i) > 0) {
3839 SndHalf = SVOp->getMaskElt(i)/HalfSize;
3840 break;
3841 }
3842 }
3843
3844 return (FstHalf | (SndHalf << 4));
3845 }
3846
3847 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
3848 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
3849 /// Note that VPERMIL mask matching is different depending on whether the underlying
3850 /// type is 32-bit or 64-bit. For VPERMILPS the high half of the mask should point
3851 /// to the same elements as the low half, but within the higher half of the source.
3852 /// For VPERMILPD the two lanes can be shuffled independently of each other,
3853 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
3854 static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
3855 if (!HasFp256)
3856 return false;
3857
3858 unsigned NumElts = VT.getVectorNumElements();
3859 // Only match 256-bit with 32/64-bit types
3860 if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
3861 return false;
3862
3863 unsigned NumLanes = VT.getSizeInBits()/128;
3864 unsigned LaneSize = NumElts/NumLanes;
3865 for (unsigned l = 0; l != NumElts; l += LaneSize) {
3866 for (unsigned i = 0; i != LaneSize; ++i) {
3867 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
3868 return false;
3869 if (NumElts != 8 || l == 0)
3870 continue;
3871 // VPERMILPS handling
3872 if (Mask[i] < 0)
3873 continue;
3874 if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
3875 return false;
3876 }
3877 }
3878
3879 return true;
3880 }
3881
3882 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
3883 /// x86 MOVSS/MOVSD wants: x86 MOVS requires the lowest element to be the lowest
3884 /// element of vector 2 and the other elements to come from vector 1 in order.
3885 static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
3886 bool V2IsSplat = false, bool V2IsUndef = false) {
3887 if (!VT.is128BitVector())
3888 return false;
3889
3890 unsigned NumOps = VT.getVectorNumElements();
3891 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3892 return false;
3893
3894 if (!isUndefOrEqual(Mask[0], 0))
3895 return false;
3896
3897 for (unsigned i = 1; i != NumOps; ++i)
3898 if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3899 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3900 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3901 return false;
3902
3903 return true;
3904 }
3905
3906 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3907 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3908/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 3909static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, 3910 const X86Subtarget *Subtarget) { 3911 if (!Subtarget->hasSSE3()) 3912 return false; 3913 3914 unsigned NumElems = VT.getVectorNumElements(); 3915 3916 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3917 (VT.getSizeInBits() == 256 && NumElems != 8)) 3918 return false; 3919 3920 // "i+1" is the value the indexed mask element must have 3921 for (unsigned i = 0; i != NumElems; i += 2) 3922 if (!isUndefOrEqual(Mask[i], i+1) || 3923 !isUndefOrEqual(Mask[i+1], i+1)) 3924 return false; 3925 3926 return true; 3927} 3928 3929/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3930/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3931/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 3932static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, 3933 const X86Subtarget *Subtarget) { 3934 if (!Subtarget->hasSSE3()) 3935 return false; 3936 3937 unsigned NumElems = VT.getVectorNumElements(); 3938 3939 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3940 (VT.getSizeInBits() == 256 && NumElems != 8)) 3941 return false; 3942 3943 // "i" is the value the indexed mask element must have 3944 for (unsigned i = 0; i != NumElems; i += 2) 3945 if (!isUndefOrEqual(Mask[i], i) || 3946 !isUndefOrEqual(Mask[i+1], i)) 3947 return false; 3948 3949 return true; 3950} 3951 3952/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand 3953/// specifies a shuffle of elements that is suitable for input to 256-bit 3954/// version of MOVDDUP. 3955static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { 3956 if (!HasFp256 || !VT.is256BitVector()) 3957 return false; 3958 3959 unsigned NumElts = VT.getVectorNumElements(); 3960 if (NumElts != 4) 3961 return false; 3962 3963 for (unsigned i = 0; i != NumElts/2; ++i) 3964 if (!isUndefOrEqual(Mask[i], 0)) 3965 return false; 3966 for (unsigned i = NumElts/2; i != NumElts; ++i) 3967 if (!isUndefOrEqual(Mask[i], NumElts/2)) 3968 return false; 3969 return true; 3970} 3971 3972/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3973/// specifies a shuffle of elements that is suitable for input to 128-bit 3974/// version of MOVDDUP. 3975static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { 3976 if (!VT.is128BitVector()) 3977 return false; 3978 3979 unsigned e = VT.getVectorNumElements() / 2; 3980 for (unsigned i = 0; i != e; ++i) 3981 if (!isUndefOrEqual(Mask[i], i)) 3982 return false; 3983 for (unsigned i = 0; i != e; ++i) 3984 if (!isUndefOrEqual(Mask[e+i], i)) 3985 return false; 3986 return true; 3987} 3988 3989/// isVEXTRACTF128Index - Return true if the specified 3990/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3991/// suitable for input to VEXTRACTF128. 3992bool X86::isVEXTRACTF128Index(SDNode *N) { 3993 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3994 return false; 3995 3996 // The index should be aligned on a 128-bit boundary. 
3997 uint64_t Index = 3998 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3999 4000 unsigned VL = N->getValueType(0).getVectorNumElements(); 4001 unsigned VBits = N->getValueType(0).getSizeInBits(); 4002 unsigned ElSize = VBits / VL; 4003 bool Result = (Index * ElSize) % 128 == 0; 4004 4005 return Result; 4006} 4007 4008/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 4009/// operand specifies a subvector insert that is suitable for input to 4010/// VINSERTF128. 4011bool X86::isVINSERTF128Index(SDNode *N) { 4012 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4013 return false; 4014 4015 // The index should be aligned on a 128-bit boundary. 4016 uint64_t Index = 4017 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4018 4019 unsigned VL = N->getValueType(0).getVectorNumElements(); 4020 unsigned VBits = N->getValueType(0).getSizeInBits(); 4021 unsigned ElSize = VBits / VL; 4022 bool Result = (Index * ElSize) % 128 == 0; 4023 4024 return Result; 4025} 4026 4027/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 4028/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 4029/// Handles 128-bit and 256-bit. 4030static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { 4031 EVT VT = N->getValueType(0); 4032 4033 assert((VT.is128BitVector() || VT.is256BitVector()) && 4034 "Unsupported vector type for PSHUF/SHUFP"); 4035 4036 // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate 4037 // independently on 128-bit lanes. 4038 unsigned NumElts = VT.getVectorNumElements(); 4039 unsigned NumLanes = VT.getSizeInBits()/128; 4040 unsigned NumLaneElts = NumElts/NumLanes; 4041 4042 assert((NumLaneElts == 2 || NumLaneElts == 4) && 4043 "Only supports 2 or 4 elements per lane"); 4044 4045 unsigned Shift = (NumLaneElts == 4) ? 1 : 0; 4046 unsigned Mask = 0; 4047 for (unsigned i = 0; i != NumElts; ++i) { 4048 int Elt = N->getMaskElt(i); 4049 if (Elt < 0) continue; 4050 Elt &= NumLaneElts - 1; 4051 unsigned ShAmt = (i << Shift) % 8; 4052 Mask |= Elt << ShAmt; 4053 } 4054 4055 return Mask; 4056} 4057 4058/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 4059/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 4060static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { 4061 EVT VT = N->getValueType(0); 4062 4063 assert((VT == MVT::v8i16 || VT == MVT::v16i16) && 4064 "Unsupported vector type for PSHUFHW"); 4065 4066 unsigned NumElts = VT.getVectorNumElements(); 4067 4068 unsigned Mask = 0; 4069 for (unsigned l = 0; l != NumElts; l += 8) { 4070 // 8 nodes per lane, but we only care about the last 4. 4071 for (unsigned i = 0; i < 4; ++i) { 4072 int Elt = N->getMaskElt(l+i+4); 4073 if (Elt < 0) continue; 4074 Elt &= 0x3; // only 2-bits. 4075 Mask |= Elt << (i * 2); 4076 } 4077 } 4078 4079 return Mask; 4080} 4081 4082/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 4083/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 4084static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { 4085 EVT VT = N->getValueType(0); 4086 4087 assert((VT == MVT::v8i16 || VT == MVT::v16i16) && 4088 "Unsupported vector type for PSHUFHW"); 4089 4090 unsigned NumElts = VT.getVectorNumElements(); 4091 4092 unsigned Mask = 0; 4093 for (unsigned l = 0; l != NumElts; l += 8) { 4094 // 8 nodes per lane, but we only care about the first 4. 
4095 for (unsigned i = 0; i < 4; ++i) { 4096 int Elt = N->getMaskElt(l+i); 4097 if (Elt < 0) continue; 4098 Elt &= 0x3; // only 2-bits 4099 Mask |= Elt << (i * 2); 4100 } 4101 } 4102 4103 return Mask; 4104} 4105 4106/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 4107/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 4108static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { 4109 EVT VT = SVOp->getValueType(0); 4110 unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; 4111 4112 unsigned NumElts = VT.getVectorNumElements(); 4113 unsigned NumLanes = VT.getSizeInBits()/128; 4114 unsigned NumLaneElts = NumElts/NumLanes; 4115 4116 int Val = 0; 4117 unsigned i; 4118 for (i = 0; i != NumElts; ++i) { 4119 Val = SVOp->getMaskElt(i); 4120 if (Val >= 0) 4121 break; 4122 } 4123 if (Val >= (int)NumElts) 4124 Val -= NumElts - NumLaneElts; 4125 4126 assert(Val - i > 0 && "PALIGNR imm should be positive"); 4127 return (Val - i) * EltSize; 4128} 4129 4130/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 4131/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 4132/// instructions. 4133unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 4134 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4135 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 4136 4137 uint64_t Index = 4138 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4139 4140 EVT VecVT = N->getOperand(0).getValueType(); 4141 EVT ElVT = VecVT.getVectorElementType(); 4142 4143 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4144 return Index / NumElemsPerChunk; 4145} 4146 4147/// getInsertVINSERTF128Immediate - Return the appropriate immediate 4148/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 4149/// instructions. 4150unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 4151 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4152 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 4153 4154 uint64_t Index = 4155 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4156 4157 EVT VecVT = N->getValueType(0); 4158 EVT ElVT = VecVT.getVectorElementType(); 4159 4160 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4161 return Index / NumElemsPerChunk; 4162} 4163 4164/// getShuffleCLImmediate - Return the appropriate immediate to shuffle 4165/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. 4166/// Handles 256-bit. 4167static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { 4168 EVT VT = N->getValueType(0); 4169 4170 unsigned NumElts = VT.getVectorNumElements(); 4171 4172 assert((VT.is256BitVector() && NumElts == 4) && 4173 "Unsupported vector type for VPERMQ/VPERMPD"); 4174 4175 unsigned Mask = 0; 4176 for (unsigned i = 0; i != NumElts; ++i) { 4177 int Elt = N->getMaskElt(i); 4178 if (Elt < 0) 4179 continue; 4180 Mask |= Elt << (i*2); 4181 } 4182 4183 return Mask; 4184} 4185/// isZeroNode - Returns true if Elt is a constant zero or a floating point 4186/// constant +0.0. 4187bool X86::isZeroNode(SDValue Elt) { 4188 return ((isa<ConstantSDNode>(Elt) && 4189 cast<ConstantSDNode>(Elt)->isNullValue()) || 4190 (isa<ConstantFPSDNode>(Elt) && 4191 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 4192} 4193 4194/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 4195/// their permute mask. 
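/// For example (illustrative), shuffling (A, B) with mask <4, 1, 6, 3> becomes
/// shuffling (B, A) with mask <0, 5, 2, 7>.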
4196static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 4197 SelectionDAG &DAG) { 4198 EVT VT = SVOp->getValueType(0); 4199 unsigned NumElems = VT.getVectorNumElements(); 4200 SmallVector<int, 8> MaskVec; 4201 4202 for (unsigned i = 0; i != NumElems; ++i) { 4203 int Idx = SVOp->getMaskElt(i); 4204 if (Idx >= 0) { 4205 if (Idx < (int)NumElems) 4206 Idx += NumElems; 4207 else 4208 Idx -= NumElems; 4209 } 4210 MaskVec.push_back(Idx); 4211 } 4212 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 4213 SVOp->getOperand(0), &MaskVec[0]); 4214} 4215 4216/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 4217/// match movhlps. The lower half elements should come from upper half of 4218/// V1 (and in order), and the upper half elements should come from the upper 4219/// half of V2 (and in order). 4220static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) { 4221 if (!VT.is128BitVector()) 4222 return false; 4223 if (VT.getVectorNumElements() != 4) 4224 return false; 4225 for (unsigned i = 0, e = 2; i != e; ++i) 4226 if (!isUndefOrEqual(Mask[i], i+2)) 4227 return false; 4228 for (unsigned i = 2; i != 4; ++i) 4229 if (!isUndefOrEqual(Mask[i], i+4)) 4230 return false; 4231 return true; 4232} 4233 4234/// isScalarLoadToVector - Returns true if the node is a scalar load that 4235/// is promoted to a vector. It also returns the LoadSDNode by reference if 4236/// required. 4237static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 4238 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 4239 return false; 4240 N = N->getOperand(0).getNode(); 4241 if (!ISD::isNON_EXTLoad(N)) 4242 return false; 4243 if (LD) 4244 *LD = cast<LoadSDNode>(N); 4245 return true; 4246} 4247 4248// Test whether the given value is a vector value which will be legalized 4249// into a load. 4250static bool WillBeConstantPoolLoad(SDNode *N) { 4251 if (N->getOpcode() != ISD::BUILD_VECTOR) 4252 return false; 4253 4254 // Check for any non-constant elements. 4255 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) 4256 switch (N->getOperand(i).getNode()->getOpcode()) { 4257 case ISD::UNDEF: 4258 case ISD::ConstantFP: 4259 case ISD::Constant: 4260 break; 4261 default: 4262 return false; 4263 } 4264 4265 // Vectors of all-zeros and all-ones are materialized with special 4266 // instructions rather than being loaded. 4267 return !ISD::isBuildVectorAllZeros(N) && 4268 !ISD::isBuildVectorAllOnes(N); 4269} 4270 4271/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 4272/// match movlp{s|d}. The lower half elements should come from lower half of 4273/// V1 (and in order), and the upper half elements should come from the upper 4274/// half of V2 (and in order). And since V1 will become the source of the 4275/// MOVLP, it must be either a vector load or a scalar load to vector. 4276static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 4277 ArrayRef<int> Mask, EVT VT) { 4278 if (!VT.is128BitVector()) 4279 return false; 4280 4281 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 4282 return false; 4283 // Is V2 is a vector load, don't do this transformation. We will try to use 4284 // load folding shufps op. 
4285 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) 4286 return false; 4287 4288 unsigned NumElems = VT.getVectorNumElements(); 4289 4290 if (NumElems != 2 && NumElems != 4) 4291 return false; 4292 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 4293 if (!isUndefOrEqual(Mask[i], i)) 4294 return false; 4295 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) 4296 if (!isUndefOrEqual(Mask[i], i+NumElems)) 4297 return false; 4298 return true; 4299} 4300 4301/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 4302/// all the same. 4303static bool isSplatVector(SDNode *N) { 4304 if (N->getOpcode() != ISD::BUILD_VECTOR) 4305 return false; 4306 4307 SDValue SplatValue = N->getOperand(0); 4308 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 4309 if (N->getOperand(i) != SplatValue) 4310 return false; 4311 return true; 4312} 4313 4314/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 4315/// to an zero vector. 4316/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 4317static bool isZeroShuffle(ShuffleVectorSDNode *N) { 4318 SDValue V1 = N->getOperand(0); 4319 SDValue V2 = N->getOperand(1); 4320 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 4321 for (unsigned i = 0; i != NumElems; ++i) { 4322 int Idx = N->getMaskElt(i); 4323 if (Idx >= (int)NumElems) { 4324 unsigned Opc = V2.getOpcode(); 4325 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 4326 continue; 4327 if (Opc != ISD::BUILD_VECTOR || 4328 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 4329 return false; 4330 } else if (Idx >= 0) { 4331 unsigned Opc = V1.getOpcode(); 4332 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 4333 continue; 4334 if (Opc != ISD::BUILD_VECTOR || 4335 !X86::isZeroNode(V1.getOperand(Idx))) 4336 return false; 4337 } 4338 } 4339 return true; 4340} 4341 4342/// getZeroVector - Returns a vector of specified type with all zero elements. 4343/// 4344static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, 4345 SelectionDAG &DAG, DebugLoc dl) { 4346 assert(VT.isVector() && "Expected a vector type"); 4347 unsigned Size = VT.getSizeInBits(); 4348 4349 // Always build SSE zero vectors as <4 x i32> bitcasted 4350 // to their dest type. This ensures they get CSE'd. 4351 SDValue Vec; 4352 if (Size == 128) { // SSE 4353 if (Subtarget->hasSSE2()) { // SSE2 4354 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4355 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4356 } else { // SSE1 4357 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4358 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 4359 } 4360 } else if (Size == 256) { // AVX 4361 if (Subtarget->hasInt256()) { // AVX2 4362 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4363 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4364 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); 4365 } else { 4366 // 256-bit logic and arithmetic instructions in AVX are all 4367 // floating-point, no support for integer ops. Emit fp zeroed vectors. 4368 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4369 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4370 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 4371 } 4372 } else 4373 llvm_unreachable("Unexpected vector type"); 4374 4375 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4376} 4377 4378/// getOnesVector - Returns a vector of specified type with all bits set. 
4379/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with 4380/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. 4381/// Then bitcast to their original type, ensuring they get CSE'd. 4382static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG, 4383 DebugLoc dl) { 4384 assert(VT.isVector() && "Expected a vector type"); 4385 unsigned Size = VT.getSizeInBits(); 4386 4387 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 4388 SDValue Vec; 4389 if (Size == 256) { 4390 if (HasInt256) { // AVX2 4391 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4392 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); 4393 } else { // AVX 4394 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4395 Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); 4396 } 4397 } else if (Size == 128) { 4398 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4399 } else 4400 llvm_unreachable("Unexpected vector type"); 4401 4402 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4403} 4404 4405/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 4406/// that point to V2 points to its first element. 4407static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) { 4408 for (unsigned i = 0; i != NumElems; ++i) { 4409 if (Mask[i] > (int)NumElems) { 4410 Mask[i] = NumElems; 4411 } 4412 } 4413} 4414 4415/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 4416/// operation of specified width. 4417static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4418 SDValue V2) { 4419 unsigned NumElems = VT.getVectorNumElements(); 4420 SmallVector<int, 8> Mask; 4421 Mask.push_back(NumElems); 4422 for (unsigned i = 1; i != NumElems; ++i) 4423 Mask.push_back(i); 4424 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4425} 4426 4427/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 4428static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4429 SDValue V2) { 4430 unsigned NumElems = VT.getVectorNumElements(); 4431 SmallVector<int, 8> Mask; 4432 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 4433 Mask.push_back(i); 4434 Mask.push_back(i + NumElems); 4435 } 4436 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4437} 4438 4439/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 4440static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4441 SDValue V2) { 4442 unsigned NumElems = VT.getVectorNumElements(); 4443 SmallVector<int, 8> Mask; 4444 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { 4445 Mask.push_back(i + Half); 4446 Mask.push_back(i + NumElems + Half); 4447 } 4448 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4449} 4450 4451// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by 4452// a generic shuffle instruction because the target has no such instructions. 4453// Generate shuffles which repeat i16 and i8 several times until they can be 4454// represented by v4f32 and then be manipulated by target suported shuffles. 
4455static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { 4456 EVT VT = V.getValueType(); 4457 int NumElems = VT.getVectorNumElements(); 4458 DebugLoc dl = V.getDebugLoc(); 4459 4460 while (NumElems > 4) { 4461 if (EltNo < NumElems/2) { 4462 V = getUnpackl(DAG, dl, VT, V, V); 4463 } else { 4464 V = getUnpackh(DAG, dl, VT, V, V); 4465 EltNo -= NumElems/2; 4466 } 4467 NumElems >>= 1; 4468 } 4469 return V; 4470} 4471 4472/// getLegalSplat - Generate a legal splat with supported x86 shuffles 4473static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { 4474 EVT VT = V.getValueType(); 4475 DebugLoc dl = V.getDebugLoc(); 4476 unsigned Size = VT.getSizeInBits(); 4477 4478 if (Size == 128) { 4479 V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); 4480 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 4481 V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), 4482 &SplatMask[0]); 4483 } else if (Size == 256) { 4484 // To use VPERMILPS to splat scalars, the second half of indicies must 4485 // refer to the higher part, which is a duplication of the lower one, 4486 // because VPERMILPS can only handle in-lane permutations. 4487 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, 4488 EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; 4489 4490 V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); 4491 V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), 4492 &SplatMask[0]); 4493 } else 4494 llvm_unreachable("Vector size not supported"); 4495 4496 return DAG.getNode(ISD::BITCAST, dl, VT, V); 4497} 4498 4499/// PromoteSplat - Splat is promoted to target supported vector shuffles. 4500static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 4501 EVT SrcVT = SV->getValueType(0); 4502 SDValue V1 = SV->getOperand(0); 4503 DebugLoc dl = SV->getDebugLoc(); 4504 4505 int EltNo = SV->getSplatIndex(); 4506 int NumElems = SrcVT.getVectorNumElements(); 4507 unsigned Size = SrcVT.getSizeInBits(); 4508 4509 assert(((Size == 128 && NumElems > 4) || Size == 256) && 4510 "Unknown how to promote splat for type"); 4511 4512 // Extract the 128-bit part containing the splat element and update 4513 // the splat element index when it refers to the higher register. 4514 if (Size == 256) { 4515 V1 = Extract128BitVector(V1, EltNo, DAG, dl); 4516 if (EltNo >= NumElems/2) 4517 EltNo -= NumElems/2; 4518 } 4519 4520 // All i16 and i8 vector types can't be used directly by a generic shuffle 4521 // instruction because the target has no such instruction. Generate shuffles 4522 // which repeat i16 and i8 several times until they fit in i32, and then can 4523 // be manipulated by target suported shuffles. 4524 EVT EltVT = SrcVT.getVectorElementType(); 4525 if (EltVT == MVT::i8 || EltVT == MVT::i16) 4526 V1 = PromoteSplati8i16(V1, DAG, EltNo); 4527 4528 // Recreate the 256-bit vector and place the same 128-bit vector 4529 // into the low and high part. This is necessary because we want 4530 // to use VPERM* to shuffle the vectors 4531 if (Size == 256) { 4532 V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); 4533 } 4534 4535 return getLegalSplat(DAG, V1, EltNo); 4536} 4537 4538/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4539/// vector of zero or undef vector. This produces a shuffle where the low 4540/// element of V2 is swizzled into the zero/undef vector, landing at element 4541/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 
4542static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4543 bool IsZero, 4544 const X86Subtarget *Subtarget, 4545 SelectionDAG &DAG) { 4546 EVT VT = V2.getValueType(); 4547 SDValue V1 = IsZero 4548 ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 4549 unsigned NumElems = VT.getVectorNumElements(); 4550 SmallVector<int, 16> MaskVec; 4551 for (unsigned i = 0; i != NumElems; ++i) 4552 // If this is the insertion idx, put the low elt of V2 here. 4553 MaskVec.push_back(i == Idx ? NumElems : i); 4554 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 4555} 4556 4557/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the 4558/// target specific opcode. Returns true if the Mask could be calculated. 4559/// Sets IsUnary to true if only uses one source. 4560static bool getTargetShuffleMask(SDNode *N, MVT VT, 4561 SmallVectorImpl<int> &Mask, bool &IsUnary) { 4562 unsigned NumElems = VT.getVectorNumElements(); 4563 SDValue ImmN; 4564 4565 IsUnary = false; 4566 switch(N->getOpcode()) { 4567 case X86ISD::SHUFP: 4568 ImmN = N->getOperand(N->getNumOperands()-1); 4569 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4570 break; 4571 case X86ISD::UNPCKH: 4572 DecodeUNPCKHMask(VT, Mask); 4573 break; 4574 case X86ISD::UNPCKL: 4575 DecodeUNPCKLMask(VT, Mask); 4576 break; 4577 case X86ISD::MOVHLPS: 4578 DecodeMOVHLPSMask(NumElems, Mask); 4579 break; 4580 case X86ISD::MOVLHPS: 4581 DecodeMOVLHPSMask(NumElems, Mask); 4582 break; 4583 case X86ISD::PSHUFD: 4584 case X86ISD::VPERMILP: 4585 ImmN = N->getOperand(N->getNumOperands()-1); 4586 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4587 IsUnary = true; 4588 break; 4589 case X86ISD::PSHUFHW: 4590 ImmN = N->getOperand(N->getNumOperands()-1); 4591 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4592 IsUnary = true; 4593 break; 4594 case X86ISD::PSHUFLW: 4595 ImmN = N->getOperand(N->getNumOperands()-1); 4596 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4597 IsUnary = true; 4598 break; 4599 case X86ISD::VPERMI: 4600 ImmN = N->getOperand(N->getNumOperands()-1); 4601 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4602 IsUnary = true; 4603 break; 4604 case X86ISD::MOVSS: 4605 case X86ISD::MOVSD: { 4606 // The index 0 always comes from the first element of the second source, 4607 // this is why MOVSS and MOVSD are used in the first place. The other 4608 // elements come from the other positions of the first source vector 4609 Mask.push_back(NumElems); 4610 for (unsigned i = 1; i != NumElems; ++i) { 4611 Mask.push_back(i); 4612 } 4613 break; 4614 } 4615 case X86ISD::VPERM2X128: 4616 ImmN = N->getOperand(N->getNumOperands()-1); 4617 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4618 if (Mask.empty()) return false; 4619 break; 4620 case X86ISD::MOVDDUP: 4621 case X86ISD::MOVLHPD: 4622 case X86ISD::MOVLPD: 4623 case X86ISD::MOVLPS: 4624 case X86ISD::MOVSHDUP: 4625 case X86ISD::MOVSLDUP: 4626 case X86ISD::PALIGN: 4627 // Not yet implemented 4628 return false; 4629 default: llvm_unreachable("unknown target shuffle node"); 4630 } 4631 4632 return true; 4633} 4634 4635/// getShuffleScalarElt - Returns the scalar element that will make up the ith 4636/// element of the result of the vector shuffle. 
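/// For instance, asking for element 1 of
///   (vector_shuffle (build_vector a,b,c,d), undef, <2,2,3,3>)
/// follows mask entry 2 into the BUILD_VECTOR and yields the scalar 'c'.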
4637static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, 4638 unsigned Depth) { 4639 if (Depth == 6) 4640 return SDValue(); // Limit search depth. 4641 4642 SDValue V = SDValue(N, 0); 4643 EVT VT = V.getValueType(); 4644 unsigned Opcode = V.getOpcode(); 4645 4646 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 4647 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 4648 int Elt = SV->getMaskElt(Index); 4649 4650 if (Elt < 0) 4651 return DAG.getUNDEF(VT.getVectorElementType()); 4652 4653 unsigned NumElems = VT.getVectorNumElements(); 4654 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) 4655 : SV->getOperand(1); 4656 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); 4657 } 4658 4659 // Recurse into target specific vector shuffles to find scalars. 4660 if (isTargetShuffle(Opcode)) { 4661 MVT ShufVT = V.getValueType().getSimpleVT(); 4662 unsigned NumElems = ShufVT.getVectorNumElements(); 4663 SmallVector<int, 16> ShuffleMask; 4664 bool IsUnary; 4665 4666 if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) 4667 return SDValue(); 4668 4669 int Elt = ShuffleMask[Index]; 4670 if (Elt < 0) 4671 return DAG.getUNDEF(ShufVT.getVectorElementType()); 4672 4673 SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) 4674 : N->getOperand(1); 4675 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, 4676 Depth+1); 4677 } 4678 4679 // Actual nodes that may contain scalar elements 4680 if (Opcode == ISD::BITCAST) { 4681 V = V.getOperand(0); 4682 EVT SrcVT = V.getValueType(); 4683 unsigned NumElems = VT.getVectorNumElements(); 4684 4685 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 4686 return SDValue(); 4687 } 4688 4689 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 4690 return (Index == 0) ? V.getOperand(0) 4691 : DAG.getUNDEF(VT.getVectorElementType()); 4692 4693 if (V.getOpcode() == ISD::BUILD_VECTOR) 4694 return V.getOperand(Index); 4695 4696 return SDValue(); 4697} 4698 4699/// getNumOfConsecutiveZeros - Return the number of elements of a vector 4700/// shuffle operation which come from a consecutively from a zero. The 4701/// search can start in two different directions, from left or right. 4702static 4703unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems, 4704 bool ZerosFromLeft, SelectionDAG &DAG) { 4705 unsigned i; 4706 for (i = 0; i != NumElems; ++i) { 4707 unsigned Index = ZerosFromLeft ? i : NumElems-i-1; 4708 SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); 4709 if (!(Elt.getNode() && 4710 (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) 4711 break; 4712 } 4713 4714 return i; 4715} 4716 4717/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) 4718/// correspond consecutively to elements from one of the vector operands, 4719/// starting from its index OpIdx. Also tell OpNum which source vector operand. 
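/// For example, with NumElems == 4, the mask entries <u,1,2,3> over the range
/// [1,4) starting at OpIdx 1 all come consecutively from V1, so the function
/// returns true and sets OpNum to 0.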
static
bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
                              unsigned NumElems, unsigned &OpNum) {
  bool SeenV1 = false;
  bool SeenV2 = false;

  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
    int Idx = SVOp->getMaskElt(i);
    // Ignore undef indices
    if (Idx < 0)
      continue;

    if (Idx < (int)NumElems)
      SeenV1 = true;
    else
      SeenV2 = true;

    // Only accept consecutive elements from the same vector
    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
      return false;
  }

  OpNum = SeenV1 ? 0 : 1;
  return true;
}

/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical right shift of a vector.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              false /* check zeros from right */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //                     V1 = {X, A, B, C}     0
  //                          \  \  \    /
  //    vector_shuffle V1, V2 <1, 2, 3, X>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            0,                 // Mask Start Index
            NumElems-NumZeros, // Mask End Index (exclusive)
            NumZeros,          // Where to start looking in the src vector
            NumElems,          // Number of elements in vector
            OpSrc))            // Which source operand ?
    return false;

  isLeft = false;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
/// logical left shift of a vector.
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              true /* check zeros from left */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //                           0    { A, B, X, X } = V2
  //                          / \    /  /
  //    vector_shuffle V1, V2 <X, X, 4, 5>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            NumZeros,     // Mask Start Index
            NumElems,     // Mask End Index (exclusive)
            0,            // Where to start looking in the src vector
            NumElems,     // Number of elements in vector
            OpSrc))       // Which source operand ?
    return false;

  isLeft = true;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  // Although the logic below supports any bitwidth size, there are no
  // shift instructions which handle more than 128-bit vectors.
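  // (PSLLDQ/PSRLDQ, which the VSHLDQ/VSRLDQ nodes are selected to, only shift
  // a single 128-bit register, so wider vector types are rejected here.)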
4819 if (!SVOp->getValueType(0).is128BitVector()) 4820 return false; 4821 4822 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4823 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4824 return true; 4825 4826 return false; 4827} 4828 4829/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 4830/// 4831static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4832 unsigned NumNonZero, unsigned NumZero, 4833 SelectionDAG &DAG, 4834 const X86Subtarget* Subtarget, 4835 const TargetLowering &TLI) { 4836 if (NumNonZero > 8) 4837 return SDValue(); 4838 4839 DebugLoc dl = Op.getDebugLoc(); 4840 SDValue V(0, 0); 4841 bool First = true; 4842 for (unsigned i = 0; i < 16; ++i) { 4843 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4844 if (ThisIsNonZero && First) { 4845 if (NumZero) 4846 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 4847 else 4848 V = DAG.getUNDEF(MVT::v8i16); 4849 First = false; 4850 } 4851 4852 if ((i & 1) != 0) { 4853 SDValue ThisElt(0, 0), LastElt(0, 0); 4854 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4855 if (LastIsNonZero) { 4856 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4857 MVT::i16, Op.getOperand(i-1)); 4858 } 4859 if (ThisIsNonZero) { 4860 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4861 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4862 ThisElt, DAG.getConstant(8, MVT::i8)); 4863 if (LastIsNonZero) 4864 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4865 } else 4866 ThisElt = LastElt; 4867 4868 if (ThisElt.getNode()) 4869 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4870 DAG.getIntPtrConstant(i/2)); 4871 } 4872 } 4873 4874 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4875} 4876 4877/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 4878/// 4879static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4880 unsigned NumNonZero, unsigned NumZero, 4881 SelectionDAG &DAG, 4882 const X86Subtarget* Subtarget, 4883 const TargetLowering &TLI) { 4884 if (NumNonZero > 4) 4885 return SDValue(); 4886 4887 DebugLoc dl = Op.getDebugLoc(); 4888 SDValue V(0, 0); 4889 bool First = true; 4890 for (unsigned i = 0; i < 8; ++i) { 4891 bool isNonZero = (NonZeros & (1 << i)) != 0; 4892 if (isNonZero) { 4893 if (First) { 4894 if (NumZero) 4895 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 4896 else 4897 V = DAG.getUNDEF(MVT::v8i16); 4898 First = false; 4899 } 4900 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4901 MVT::v8i16, V, Op.getOperand(i), 4902 DAG.getIntPtrConstant(i)); 4903 } 4904 } 4905 4906 return V; 4907} 4908 4909/// getVShift - Return a vector logical shift node. 4910/// 4911static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4912 unsigned NumBits, SelectionDAG &DAG, 4913 const TargetLowering &TLI, DebugLoc dl) { 4914 assert(VT.is128BitVector() && "Unknown type for VShift"); 4915 EVT ShVT = MVT::v2i64; 4916 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; 4917 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4918 return DAG.getNode(ISD::BITCAST, dl, VT, 4919 DAG.getNode(Opc, dl, ShVT, SrcOp, 4920 DAG.getConstant(NumBits, 4921 TLI.getShiftAmountTy(SrcOp.getValueType())))); 4922} 4923 4924SDValue 4925X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4926 SelectionDAG &DAG) const { 4927 4928 // Check if the scalar load can be widened into a vector load. And if 4929 // the address is "base + cst" see if the cst can be "absorbed" into 4930 // the shuffle mask. 
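  // For example, a 32-bit load from 'slot + 8' that is being splatted can be
  // widened to a 16-byte load of the whole slot followed by a <2,2,2,2>
  // shuffle, provided the stack slot can be realigned to 16 bytes.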
4931 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4932 SDValue Ptr = LD->getBasePtr(); 4933 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4934 return SDValue(); 4935 EVT PVT = LD->getValueType(0); 4936 if (PVT != MVT::i32 && PVT != MVT::f32) 4937 return SDValue(); 4938 4939 int FI = -1; 4940 int64_t Offset = 0; 4941 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4942 FI = FINode->getIndex(); 4943 Offset = 0; 4944 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4945 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4946 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4947 Offset = Ptr.getConstantOperandVal(1); 4948 Ptr = Ptr.getOperand(0); 4949 } else { 4950 return SDValue(); 4951 } 4952 4953 // FIXME: 256-bit vector instructions don't require a strict alignment, 4954 // improve this code to support it better. 4955 unsigned RequiredAlign = VT.getSizeInBits()/8; 4956 SDValue Chain = LD->getChain(); 4957 // Make sure the stack object alignment is at least 16 or 32. 4958 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4959 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 4960 if (MFI->isFixedObjectIndex(FI)) { 4961 // Can't change the alignment. FIXME: It's possible to compute 4962 // the exact stack offset and reference FI + adjust offset instead. 4963 // If someone *really* cares about this. That's the way to implement it. 4964 return SDValue(); 4965 } else { 4966 MFI->setObjectAlignment(FI, RequiredAlign); 4967 } 4968 } 4969 4970 // (Offset % 16 or 32) must be multiple of 4. Then address is then 4971 // Ptr + (Offset & ~15). 4972 if (Offset < 0) 4973 return SDValue(); 4974 if ((Offset % RequiredAlign) & 3) 4975 return SDValue(); 4976 int64_t StartOffset = Offset & ~(RequiredAlign-1); 4977 if (StartOffset) 4978 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4979 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4980 4981 int EltNo = (Offset - StartOffset) >> 2; 4982 unsigned NumElems = VT.getVectorNumElements(); 4983 4984 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 4985 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 4986 LD->getPointerInfo().getWithOffset(StartOffset), 4987 false, false, false, 0); 4988 4989 SmallVector<int, 8> Mask; 4990 for (unsigned i = 0; i != NumElems; ++i) 4991 Mask.push_back(EltNo); 4992 4993 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); 4994 } 4995 4996 return SDValue(); 4997} 4998 4999/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 5000/// vector of type 'VT', see if the elements can be replaced by a single large 5001/// load which has the same value as a build_vector whose operands are 'elts'. 5002/// 5003/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 5004/// 5005/// FIXME: we'd also like to handle the case where the last elements are zero 5006/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 5007/// There's even a handy isZeroNode for that purpose. 5008static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 5009 DebugLoc &DL, SelectionDAG &DAG) { 5010 EVT EltVT = VT.getVectorElementType(); 5011 unsigned NumElems = Elts.size(); 5012 5013 LoadSDNode *LDBase = NULL; 5014 unsigned LastLoadedElt = -1U; 5015 5016 // For each element in the initializer, see if we've found a load or an undef. 5017 // If we don't find an initial load element, or later load elements are 5018 // non-consecutive, bail out. 
5019 for (unsigned i = 0; i < NumElems; ++i) { 5020 SDValue Elt = Elts[i]; 5021 5022 if (!Elt.getNode() || 5023 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 5024 return SDValue(); 5025 if (!LDBase) { 5026 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 5027 return SDValue(); 5028 LDBase = cast<LoadSDNode>(Elt.getNode()); 5029 LastLoadedElt = i; 5030 continue; 5031 } 5032 if (Elt.getOpcode() == ISD::UNDEF) 5033 continue; 5034 5035 LoadSDNode *LD = cast<LoadSDNode>(Elt); 5036 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 5037 return SDValue(); 5038 LastLoadedElt = i; 5039 } 5040 5041 // If we have found an entire vector of loads and undefs, then return a large 5042 // load of the entire vector width starting at the base pointer. If we found 5043 // consecutive loads for the low half, generate a vzext_load node. 5044 if (LastLoadedElt == NumElems - 1) { 5045 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 5046 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 5047 LDBase->getPointerInfo(), 5048 LDBase->isVolatile(), LDBase->isNonTemporal(), 5049 LDBase->isInvariant(), 0); 5050 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 5051 LDBase->getPointerInfo(), 5052 LDBase->isVolatile(), LDBase->isNonTemporal(), 5053 LDBase->isInvariant(), LDBase->getAlignment()); 5054 } 5055 if (NumElems == 4 && LastLoadedElt == 1 && 5056 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 5057 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 5058 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 5059 SDValue ResNode = 5060 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64, 5061 LDBase->getPointerInfo(), 5062 LDBase->getAlignment(), 5063 false/*isVolatile*/, true/*ReadMem*/, 5064 false/*WriteMem*/); 5065 5066 // Make sure the newly-created LOAD is in the same position as LDBase in 5067 // terms of dependency. We create a TokenFactor for LDBase and ResNode, and 5068 // update uses of LDBase's output chain to use the TokenFactor. 5069 if (LDBase->hasAnyUseOfValue(1)) { 5070 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 5071 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); 5072 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 5073 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 5074 SDValue(ResNode.getNode(), 1)); 5075 } 5076 5077 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 5078 } 5079 return SDValue(); 5080} 5081 5082/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction 5083/// to generate a splat value for the following cases: 5084/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. 5085/// 2. A splat shuffle which uses a scalar_to_vector node which comes from 5086/// a scalar load, or a constant. 5087/// The VBROADCAST node is returned when a pattern is found, 5088/// or SDValue() otherwise. 5089SDValue 5090X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { 5091 if (!Subtarget->hasFp256()) 5092 return SDValue(); 5093 5094 EVT VT = Op.getValueType(); 5095 DebugLoc dl = Op.getDebugLoc(); 5096 5097 assert((VT.is128BitVector() || VT.is256BitVector()) && 5098 "Unsupported vector type for broadcast."); 5099 5100 SDValue Ld; 5101 bool ConstSplatVal; 5102 5103 switch (Op.getOpcode()) { 5104 default: 5105 // Unknown pattern found. 5106 return SDValue(); 5107 5108 case ISD::BUILD_VECTOR: { 5109 // The BUILD_VECTOR node must be a splat. 
5110 if (!isSplatVector(Op.getNode())) 5111 return SDValue(); 5112 5113 Ld = Op.getOperand(0); 5114 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5115 Ld.getOpcode() == ISD::ConstantFP); 5116 5117 // The suspected load node has several users. Make sure that all 5118 // of its users are from the BUILD_VECTOR node. 5119 // Constants may have multiple users. 5120 if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) 5121 return SDValue(); 5122 break; 5123 } 5124 5125 case ISD::VECTOR_SHUFFLE: { 5126 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5127 5128 // Shuffles must have a splat mask where the first element is 5129 // broadcasted. 5130 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 5131 return SDValue(); 5132 5133 SDValue Sc = Op.getOperand(0); 5134 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && 5135 Sc.getOpcode() != ISD::BUILD_VECTOR) { 5136 5137 if (!Subtarget->hasInt256()) 5138 return SDValue(); 5139 5140 // Use the register form of the broadcast instruction available on AVX2. 5141 if (VT.is256BitVector()) 5142 Sc = Extract128BitVector(Sc, 0, DAG, dl); 5143 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); 5144 } 5145 5146 Ld = Sc.getOperand(0); 5147 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5148 Ld.getOpcode() == ISD::ConstantFP); 5149 5150 // The scalar_to_vector node and the suspected 5151 // load node must have exactly one user. 5152 // Constants may have multiple users. 5153 if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse())) 5154 return SDValue(); 5155 break; 5156 } 5157 } 5158 5159 bool Is256 = VT.is256BitVector(); 5160 5161 // Handle the broadcasting a single constant scalar from the constant pool 5162 // into a vector. On Sandybridge it is still better to load a constant vector 5163 // from the constant pool and not to broadcast it from a scalar. 5164 if (ConstSplatVal && Subtarget->hasInt256()) { 5165 EVT CVT = Ld.getValueType(); 5166 assert(!CVT.isVector() && "Must not broadcast a vector type"); 5167 unsigned ScalarSize = CVT.getSizeInBits(); 5168 5169 if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) { 5170 const Constant *C = 0; 5171 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) 5172 C = CI->getConstantIntValue(); 5173 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) 5174 C = CF->getConstantFPValue(); 5175 5176 assert(C && "Invalid constant type"); 5177 5178 SDValue CP = DAG.getConstantPool(C, getPointerTy()); 5179 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 5180 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, 5181 MachinePointerInfo::getConstantPool(), 5182 false, false, false, Alignment); 5183 5184 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5185 } 5186 } 5187 5188 bool IsLoad = ISD::isNormalLoad(Ld.getNode()); 5189 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 5190 5191 // Handle AVX2 in-register broadcasts. 5192 if (!IsLoad && Subtarget->hasInt256() && 5193 (ScalarSize == 32 || (Is256 && ScalarSize == 64))) 5194 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5195 5196 // The scalar source must be a normal load. 
5197 if (!IsLoad) 5198 return SDValue(); 5199 5200 if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) 5201 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5202 5203 // The integer check is needed for the 64-bit into 128-bit so it doesn't match 5204 // double since there is no vbroadcastsd xmm 5205 if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { 5206 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) 5207 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5208 } 5209 5210 // Unsupported broadcast. 5211 return SDValue(); 5212} 5213 5214SDValue 5215X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const { 5216 EVT VT = Op.getValueType(); 5217 5218 // Skip if insert_vec_elt is not supported. 5219 if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) 5220 return SDValue(); 5221 5222 DebugLoc DL = Op.getDebugLoc(); 5223 unsigned NumElems = Op.getNumOperands(); 5224 5225 SDValue VecIn1; 5226 SDValue VecIn2; 5227 SmallVector<unsigned, 4> InsertIndices; 5228 SmallVector<int, 8> Mask(NumElems, -1); 5229 5230 for (unsigned i = 0; i != NumElems; ++i) { 5231 unsigned Opc = Op.getOperand(i).getOpcode(); 5232 5233 if (Opc == ISD::UNDEF) 5234 continue; 5235 5236 if (Opc != ISD::EXTRACT_VECTOR_ELT) { 5237 // Quit if more than 1 elements need inserting. 5238 if (InsertIndices.size() > 1) 5239 return SDValue(); 5240 5241 InsertIndices.push_back(i); 5242 continue; 5243 } 5244 5245 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); 5246 SDValue ExtIdx = Op.getOperand(i).getOperand(1); 5247 5248 // Quit if extracted from vector of different type. 5249 if (ExtractedFromVec.getValueType() != VT) 5250 return SDValue(); 5251 5252 // Quit if non-constant index. 5253 if (!isa<ConstantSDNode>(ExtIdx)) 5254 return SDValue(); 5255 5256 if (VecIn1.getNode() == 0) 5257 VecIn1 = ExtractedFromVec; 5258 else if (VecIn1 != ExtractedFromVec) { 5259 if (VecIn2.getNode() == 0) 5260 VecIn2 = ExtractedFromVec; 5261 else if (VecIn2 != ExtractedFromVec) 5262 // Quit if more than 2 vectors to shuffle 5263 return SDValue(); 5264 } 5265 5266 unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); 5267 5268 if (ExtractedFromVec == VecIn1) 5269 Mask[i] = Idx; 5270 else if (ExtractedFromVec == VecIn2) 5271 Mask[i] = Idx + NumElems; 5272 } 5273 5274 if (VecIn1.getNode() == 0) 5275 return SDValue(); 5276 5277 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); 5278 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); 5279 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { 5280 unsigned Idx = InsertIndices[i]; 5281 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), 5282 DAG.getIntPtrConstant(Idx)); 5283 } 5284 5285 return NV; 5286} 5287 5288SDValue 5289X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5290 DebugLoc dl = Op.getDebugLoc(); 5291 5292 EVT VT = Op.getValueType(); 5293 EVT ExtVT = VT.getVectorElementType(); 5294 unsigned NumElems = Op.getNumOperands(); 5295 5296 // Vectors containing all zeros can be matched by pxor and xorps later 5297 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 5298 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 5299 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 
5300 if (VT == MVT::v4i32 || VT == MVT::v8i32) 5301 return Op; 5302 5303 return getZeroVector(VT, Subtarget, DAG, dl); 5304 } 5305 5306 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 5307 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use 5308 // vpcmpeqd on 256-bit vectors. 5309 if (ISD::isBuildVectorAllOnes(Op.getNode())) { 5310 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) 5311 return Op; 5312 5313 return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); 5314 } 5315 5316 SDValue Broadcast = LowerVectorBroadcast(Op, DAG); 5317 if (Broadcast.getNode()) 5318 return Broadcast; 5319 5320 unsigned EVTBits = ExtVT.getSizeInBits(); 5321 5322 unsigned NumZero = 0; 5323 unsigned NumNonZero = 0; 5324 unsigned NonZeros = 0; 5325 bool IsAllConstants = true; 5326 SmallSet<SDValue, 8> Values; 5327 for (unsigned i = 0; i < NumElems; ++i) { 5328 SDValue Elt = Op.getOperand(i); 5329 if (Elt.getOpcode() == ISD::UNDEF) 5330 continue; 5331 Values.insert(Elt); 5332 if (Elt.getOpcode() != ISD::Constant && 5333 Elt.getOpcode() != ISD::ConstantFP) 5334 IsAllConstants = false; 5335 if (X86::isZeroNode(Elt)) 5336 NumZero++; 5337 else { 5338 NonZeros |= (1 << i); 5339 NumNonZero++; 5340 } 5341 } 5342 5343 // All undef vector. Return an UNDEF. All zero vectors were handled above. 5344 if (NumNonZero == 0) 5345 return DAG.getUNDEF(VT); 5346 5347 // Special case for single non-zero, non-undef, element. 5348 if (NumNonZero == 1) { 5349 unsigned Idx = CountTrailingZeros_32(NonZeros); 5350 SDValue Item = Op.getOperand(Idx); 5351 5352 // If this is an insertion of an i64 value on x86-32, and if the top bits of 5353 // the value are obviously zero, truncate the value to i32 and do the 5354 // insertion that way. Only do this if the value is non-constant or if the 5355 // value is a constant being inserted into element 0. It is cheaper to do 5356 // a constant pool load than it is to do a movd + shuffle. 5357 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 5358 (!IsAllConstants || Idx == 0)) { 5359 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 5360 // Handle SSE only. 5361 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 5362 EVT VecVT = MVT::v4i32; 5363 unsigned VecElts = 4; 5364 5365 // Truncate the value (which may itself be a constant) to i32, and 5366 // convert it to a vector with movd (S2V+shuffle to zero extend). 5367 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 5368 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 5369 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5370 5371 // Now we have our 32-bit value zero extended in the low element of 5372 // a vector. If Idx != 0, swizzle it into place. 5373 if (Idx != 0) { 5374 SmallVector<int, 4> Mask; 5375 Mask.push_back(Idx); 5376 for (unsigned i = 1; i != VecElts; ++i) 5377 Mask.push_back(i); 5378 Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), 5379 &Mask[0]); 5380 } 5381 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5382 } 5383 } 5384 5385 // If we have a constant or non-constant insertion into the low element of 5386 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 5387 // the rest of the elements. This will be matched as movd/movq/movss/movsd 5388 // depending on what the source datatype is. 
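    // For instance, (build_vector x, 0, 0, 0) with a 32-bit x becomes a single
    // movd/movss whose upper lanes are already zero.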
5389 if (Idx == 0) { 5390 if (NumZero == 0) 5391 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5392 5393 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 5394 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 5395 if (VT.is256BitVector()) { 5396 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); 5397 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, 5398 Item, DAG.getIntPtrConstant(0)); 5399 } 5400 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5401 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5402 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 5403 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5404 } 5405 5406 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 5407 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 5408 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 5409 if (VT.is256BitVector()) { 5410 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); 5411 Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); 5412 } else { 5413 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5414 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5415 } 5416 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5417 } 5418 } 5419 5420 // Is it a vector logical left shift? 5421 if (NumElems == 2 && Idx == 1 && 5422 X86::isZeroNode(Op.getOperand(0)) && 5423 !X86::isZeroNode(Op.getOperand(1))) { 5424 unsigned NumBits = VT.getSizeInBits(); 5425 return getVShift(true, VT, 5426 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5427 VT, Op.getOperand(1)), 5428 NumBits/2, DAG, *this, dl); 5429 } 5430 5431 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 5432 return SDValue(); 5433 5434 // Otherwise, if this is a vector with i32 or f32 elements, and the element 5435 // is a non-constant being inserted into an element other than the low one, 5436 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 5437 // movd/movss) to move this into the low element, then shuffle it into 5438 // place. 5439 if (EVTBits == 32) { 5440 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5441 5442 // Turn it into a shuffle of zero and zero-extended scalar to vector. 5443 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); 5444 SmallVector<int, 8> MaskVec; 5445 for (unsigned i = 0; i != NumElems; ++i) 5446 MaskVec.push_back(i == Idx ? 0 : 1); 5447 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 5448 } 5449 } 5450 5451 // Splat is obviously ok. Let legalizer expand it to a shuffle. 5452 if (Values.size() == 1) { 5453 if (EVTBits == 32) { 5454 // Instead of a shuffle like this: 5455 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 5456 // Check if it's possible to issue this instead. 5457 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 5458 unsigned Idx = CountTrailingZeros_32(NonZeros); 5459 SDValue Item = Op.getOperand(Idx); 5460 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 5461 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 5462 } 5463 return SDValue(); 5464 } 5465 5466 // A vector full of immediates; various special cases are already 5467 // handled, so this is best done with a single constant-pool load. 5468 if (IsAllConstants) 5469 return SDValue(); 5470 5471 // For AVX-length vectors, build the individual 128-bit pieces and use 5472 // shuffles to put them in place. 
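  // For example, a v8i32 build_vector is split into two v4i32 build_vectors
  // which are then recombined with a single vinsertf128 (Concat128BitVectors).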
5473 if (VT.is256BitVector()) { 5474 SmallVector<SDValue, 32> V; 5475 for (unsigned i = 0; i != NumElems; ++i) 5476 V.push_back(Op.getOperand(i)); 5477 5478 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 5479 5480 // Build both the lower and upper subvector. 5481 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 5482 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 5483 NumElems/2); 5484 5485 // Recreate the wider vector with the lower and upper part. 5486 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 5487 } 5488 5489 // Let legalizer expand 2-wide build_vectors. 5490 if (EVTBits == 64) { 5491 if (NumNonZero == 1) { 5492 // One half is zero or undef. 5493 unsigned Idx = CountTrailingZeros_32(NonZeros); 5494 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 5495 Op.getOperand(Idx)); 5496 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); 5497 } 5498 return SDValue(); 5499 } 5500 5501 // If element VT is < 32 bits, convert it to inserts into a zero vector. 5502 if (EVTBits == 8 && NumElems == 16) { 5503 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 5504 Subtarget, *this); 5505 if (V.getNode()) return V; 5506 } 5507 5508 if (EVTBits == 16 && NumElems == 8) { 5509 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 5510 Subtarget, *this); 5511 if (V.getNode()) return V; 5512 } 5513 5514 // If element VT is == 32 bits, turn it into a number of shuffles. 5515 SmallVector<SDValue, 8> V(NumElems); 5516 if (NumElems == 4 && NumZero > 0) { 5517 for (unsigned i = 0; i < 4; ++i) { 5518 bool isZero = !(NonZeros & (1 << i)); 5519 if (isZero) 5520 V[i] = getZeroVector(VT, Subtarget, DAG, dl); 5521 else 5522 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5523 } 5524 5525 for (unsigned i = 0; i < 2; ++i) { 5526 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 5527 default: break; 5528 case 0: 5529 V[i] = V[i*2]; // Must be a zero vector. 5530 break; 5531 case 1: 5532 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 5533 break; 5534 case 2: 5535 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 5536 break; 5537 case 3: 5538 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 5539 break; 5540 } 5541 } 5542 5543 bool Reverse1 = (NonZeros & 0x3) == 2; 5544 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; 5545 int MaskVec[] = { 5546 Reverse1 ? 1 : 0, 5547 Reverse1 ? 0 : 1, 5548 static_cast<int>(Reverse2 ? NumElems+1 : NumElems), 5549 static_cast<int>(Reverse2 ? NumElems : NumElems+1) 5550 }; 5551 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 5552 } 5553 5554 if (Values.size() > 1 && VT.is128BitVector()) { 5555 // Check for a build vector of consecutive loads. 5556 for (unsigned i = 0; i < NumElems; ++i) 5557 V[i] = Op.getOperand(i); 5558 5559 // Check for elements which are consecutive loads. 5560 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 5561 if (LD.getNode()) 5562 return LD; 5563 5564 // Check for a build vector from mostly shuffle plus few inserting. 5565 SDValue Sh = buildFromShuffleMostly(Op, DAG); 5566 if (Sh.getNode()) 5567 return Sh; 5568 5569 // For SSE 4.1, use insertps to put the high elements into the low element. 
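    // Each remaining non-undef operand is added with INSERT_VECTOR_ELT, which
    // the SSE4.1 patterns select as insertps/pinsr*.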
5570 if (getSubtarget()->hasSSE41()) { 5571 SDValue Result; 5572 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 5573 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 5574 else 5575 Result = DAG.getUNDEF(VT); 5576 5577 for (unsigned i = 1; i < NumElems; ++i) { 5578 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 5579 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 5580 Op.getOperand(i), DAG.getIntPtrConstant(i)); 5581 } 5582 return Result; 5583 } 5584 5585 // Otherwise, expand into a number of unpckl*, start by extending each of 5586 // our (non-undef) elements to the full vector width with the element in the 5587 // bottom slot of the vector (which generates no code for SSE). 5588 for (unsigned i = 0; i < NumElems; ++i) { 5589 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 5590 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5591 else 5592 V[i] = DAG.getUNDEF(VT); 5593 } 5594 5595 // Next, we iteratively mix elements, e.g. for v4f32: 5596 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 5597 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 5598 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 5599 unsigned EltStride = NumElems >> 1; 5600 while (EltStride != 0) { 5601 for (unsigned i = 0; i < EltStride; ++i) { 5602 // If V[i+EltStride] is undef and this is the first round of mixing, 5603 // then it is safe to just drop this shuffle: V[i] is already in the 5604 // right place, the one element (since it's the first round) being 5605 // inserted as undef can be dropped. This isn't safe for successive 5606 // rounds because they will permute elements within both vectors. 5607 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 5608 EltStride == NumElems/2) 5609 continue; 5610 5611 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 5612 } 5613 EltStride >>= 1; 5614 } 5615 return V[0]; 5616 } 5617 return SDValue(); 5618} 5619 5620// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 5621// to create 256-bit vectors from two other 128-bit ones. 5622static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5623 DebugLoc dl = Op.getDebugLoc(); 5624 EVT ResVT = Op.getValueType(); 5625 5626 assert(ResVT.is256BitVector() && "Value type must be 256-bit wide"); 5627 5628 SDValue V1 = Op.getOperand(0); 5629 SDValue V2 = Op.getOperand(1); 5630 unsigned NumElems = ResVT.getVectorNumElements(); 5631 5632 return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 5633} 5634 5635static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5636 assert(Op.getNumOperands() == 2); 5637 5638 // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors 5639 // from two other 128-bit ones. 5640 return LowerAVXCONCAT_VECTORS(Op, DAG); 5641} 5642 5643// Try to lower a shuffle node into a simple blend instruction. 5644static SDValue 5645LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, 5646 const X86Subtarget *Subtarget, SelectionDAG &DAG) { 5647 SDValue V1 = SVOp->getOperand(0); 5648 SDValue V2 = SVOp->getOperand(1); 5649 DebugLoc dl = SVOp->getDebugLoc(); 5650 EVT VT = SVOp->getValueType(0); 5651 EVT EltVT = VT.getVectorElementType(); 5652 unsigned NumElems = VT.getVectorNumElements(); 5653 5654 if (!Subtarget->hasSSE41() || EltVT == MVT::i8) 5655 return SDValue(); 5656 if (!Subtarget->hasInt256() && VT == MVT::v16i16) 5657 return SDValue(); 5658 5659 // Check the mask for BLEND and build the value. 5660 unsigned MaskValue = 0; 5661 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. 
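  // For example, a v8i16 shuffle with mask <0,9,2,11,4,13,6,15> takes elements
  // 1, 3, 5 and 7 from V2, so MaskValue becomes 0xAA.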
  unsigned NumLanes = (NumElems-1)/8 + 1;
  unsigned NumElemsInLane = NumElems / NumLanes;

  // Blend for v16i16 should be symmetric for both lanes.
  for (unsigned i = 0; i < NumElemsInLane; ++i) {

    int SndLaneEltIdx = (NumLanes == 2) ?
      SVOp->getMaskElt(i + NumElemsInLane) : -1;
    int EltIdx = SVOp->getMaskElt(i);

    if ((EltIdx == -1 || EltIdx == (int)i) &&
        (SndLaneEltIdx == -1 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
      continue;

    if (((unsigned)EltIdx == (i + NumElems)) &&
        (SndLaneEltIdx == -1 ||
         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
      MaskValue |= (1<<i);
    else
      return SDValue();
  }

  // Convert i32 vectors to floating point if it is not AVX2.
  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
  EVT BlendVT = VT;
  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
    BlendVT = EVT::getVectorVT(*DAG.getContext(),
                               EVT::getFloatingPointVT(EltVT.getSizeInBits()),
                               NumElems);
    // Bitcast the operands to the blend type so the BLENDI node is
    // type-consistent.
    V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
  }

  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
                            DAG.getConstant(MaskValue, MVT::i32));
  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
}

// v8i16 shuffles - Prefer shuffles in the following order:
//  1. [all]   pshuflw, pshufhw, optional move
//  2. [ssse3] 1 x pshufb
//  3. [ssse3] 2 x pshufb + 1 x por
//  4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
static SDValue
LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
                         SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 8> MaskVals;

  // Determine if more than 1 of the words in each of the low and high
  // quadwords of the result come from the same quadword of one of the two
  // inputs.  Undef mask values count as coming from any quadword, for better
  // codegen.
  unsigned LoQuad[] = { 0, 0, 0, 0 };
  unsigned HiQuad[] = { 0, 0, 0, 0 };
  std::bitset<4> InputQuads;
  for (unsigned i = 0; i < 8; ++i) {
    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
    int EltIdx = SVOp->getMaskElt(i);
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0) {
      ++Quad[0];
      ++Quad[1];
      ++Quad[2];
      ++Quad[3];
      continue;
    }
    ++Quad[EltIdx / 4];
    InputQuads.set(EltIdx / 4);
  }

  int BestLoQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LoQuad[i] > MaxQuad) {
      BestLoQuad = i;
      MaxQuad = LoQuad[i];
    }
  }

  int BestHiQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HiQuad[i] > MaxQuad) {
      BestHiQuad = i;
      MaxQuad = HiQuad[i];
    }
  }

  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
  // of the two input vectors, shuffle them into one input vector so only a
  // single pshufb instruction is necessary. If there are more than 2 input
  // quads, disable the next transformation since it does not help SSSE3.
  bool V1Used = InputQuads[0] || InputQuads[1];
  bool V2Used = InputQuads[2] || InputQuads[3];
  if (Subtarget->hasSSSE3()) {
    if (InputQuads.count() == 2 && V1Used && V2Used) {
      BestLoQuad = InputQuads[0] ?
0 : 1; 5762 BestHiQuad = InputQuads[2] ? 2 : 3; 5763 } 5764 if (InputQuads.count() > 2) { 5765 BestLoQuad = -1; 5766 BestHiQuad = -1; 5767 } 5768 } 5769 5770 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 5771 // the shuffle mask. If a quad is scored as -1, that means that it contains 5772 // words from all 4 input quadwords. 5773 SDValue NewV; 5774 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 5775 int MaskV[] = { 5776 BestLoQuad < 0 ? 0 : BestLoQuad, 5777 BestHiQuad < 0 ? 1 : BestHiQuad 5778 }; 5779 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 5780 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 5781 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 5782 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 5783 5784 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 5785 // source words for the shuffle, to aid later transformations. 5786 bool AllWordsInNewV = true; 5787 bool InOrder[2] = { true, true }; 5788 for (unsigned i = 0; i != 8; ++i) { 5789 int idx = MaskVals[i]; 5790 if (idx != (int)i) 5791 InOrder[i/4] = false; 5792 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 5793 continue; 5794 AllWordsInNewV = false; 5795 break; 5796 } 5797 5798 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 5799 if (AllWordsInNewV) { 5800 for (int i = 0; i != 8; ++i) { 5801 int idx = MaskVals[i]; 5802 if (idx < 0) 5803 continue; 5804 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 5805 if ((idx != i) && idx < 4) 5806 pshufhw = false; 5807 if ((idx != i) && idx > 3) 5808 pshuflw = false; 5809 } 5810 V1 = NewV; 5811 V2Used = false; 5812 BestLoQuad = 0; 5813 BestHiQuad = 1; 5814 } 5815 5816 // If we've eliminated the use of V2, and the new mask is a pshuflw or 5817 // pshufhw, that's as cheap as it gets. Return the new shuffle. 5818 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 5819 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 5820 unsigned TargetMask = 0; 5821 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 5822 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 5823 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5824 TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): 5825 getShufflePSHUFLWImmediate(SVOp); 5826 V1 = NewV.getOperand(0); 5827 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 5828 } 5829 } 5830 5831 // If we have SSSE3, and all words of the result are from 1 input vector, 5832 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 5833 // is present, fall back to case 4. 5834 if (Subtarget->hasSSSE3()) { 5835 SmallVector<SDValue,16> pshufbMask; 5836 5837 // If we have elements from both input vectors, set the high bit of the 5838 // shuffle mask element to zero out elements that come from V2 in the V1 5839 // mask, and elements that come from V1 in the V2 mask, so that the two 5840 // results can be OR'd together. 5841 bool TwoInputs = V1Used && V2Used; 5842 for (unsigned i = 0; i != 8; ++i) { 5843 int EltIdx = MaskVals[i] * 2; 5844 int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx; 5845 int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 
0x80 : EltIdx+1; 5846 pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); 5847 pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); 5848 } 5849 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 5850 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5851 DAG.getNode(ISD::BUILD_VECTOR, dl, 5852 MVT::v16i8, &pshufbMask[0], 16)); 5853 if (!TwoInputs) 5854 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5855 5856 // Calculate the shuffle mask for the second input, shuffle it, and 5857 // OR it with the first shuffled input. 5858 pshufbMask.clear(); 5859 for (unsigned i = 0; i != 8; ++i) { 5860 int EltIdx = MaskVals[i] * 2; 5861 int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16; 5862 int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15; 5863 pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); 5864 pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); 5865 } 5866 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 5867 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5868 DAG.getNode(ISD::BUILD_VECTOR, dl, 5869 MVT::v16i8, &pshufbMask[0], 16)); 5870 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5871 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5872 } 5873 5874 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 5875 // and update MaskVals with new element order. 5876 std::bitset<8> InOrder; 5877 if (BestLoQuad >= 0) { 5878 int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; 5879 for (int i = 0; i != 4; ++i) { 5880 int idx = MaskVals[i]; 5881 if (idx < 0) { 5882 InOrder.set(i); 5883 } else if ((idx / 4) == BestLoQuad) { 5884 MaskV[i] = idx & 3; 5885 InOrder.set(i); 5886 } 5887 } 5888 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5889 &MaskV[0]); 5890 5891 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 5892 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5893 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 5894 NewV.getOperand(0), 5895 getShufflePSHUFLWImmediate(SVOp), DAG); 5896 } 5897 } 5898 5899 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 5900 // and update MaskVals with the new element order. 5901 if (BestHiQuad >= 0) { 5902 int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; 5903 for (unsigned i = 4; i != 8; ++i) { 5904 int idx = MaskVals[i]; 5905 if (idx < 0) { 5906 InOrder.set(i); 5907 } else if ((idx / 4) == BestHiQuad) { 5908 MaskV[i] = (idx & 3) + 4; 5909 InOrder.set(i); 5910 } 5911 } 5912 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5913 &MaskV[0]); 5914 5915 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 5916 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5917 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 5918 NewV.getOperand(0), 5919 getShufflePSHUFHWImmediate(SVOp), DAG); 5920 } 5921 } 5922 5923 // In case BestHi & BestLo were both -1, which means each quadword has a word 5924 // from each of the four input quadwords, calculate the InOrder bitvector now 5925 // before falling through to the insert/extract cleanup. 5926 if (BestLoQuad == -1 && BestHiQuad == -1) { 5927 NewV = V1; 5928 for (int i = 0; i != 8; ++i) 5929 if (MaskVals[i] < 0 || MaskVals[i] == i) 5930 InOrder.set(i); 5931 } 5932 5933 // The other elements are put in the right place using pextrw and pinsrw. 
5934 for (unsigned i = 0; i != 8; ++i) { 5935 if (InOrder[i]) 5936 continue; 5937 int EltIdx = MaskVals[i]; 5938 if (EltIdx < 0) 5939 continue; 5940 SDValue ExtOp = (EltIdx < 8) ? 5941 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 5942 DAG.getIntPtrConstant(EltIdx)) : 5943 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 5944 DAG.getIntPtrConstant(EltIdx - 8)); 5945 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 5946 DAG.getIntPtrConstant(i)); 5947 } 5948 return NewV; 5949} 5950 5951// v16i8 shuffles - Prefer shuffles in the following order: 5952// 1. [ssse3] 1 x pshufb 5953// 2. [ssse3] 2 x pshufb + 1 x por 5954// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 5955static 5956SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 5957 SelectionDAG &DAG, 5958 const X86TargetLowering &TLI) { 5959 SDValue V1 = SVOp->getOperand(0); 5960 SDValue V2 = SVOp->getOperand(1); 5961 DebugLoc dl = SVOp->getDebugLoc(); 5962 ArrayRef<int> MaskVals = SVOp->getMask(); 5963 5964 // If we have SSSE3, case 1 is generated when all result bytes come from 5965 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 5966 // present, fall back to case 3. 5967 5968 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 5969 if (TLI.getSubtarget()->hasSSSE3()) { 5970 SmallVector<SDValue,16> pshufbMask; 5971 5972 // If all result elements are from one input vector, then only translate 5973 // undef mask values to 0x80 (zero out result) in the pshufb mask. 5974 // 5975 // Otherwise, we have elements from both input vectors, and must zero out 5976 // elements that come from V2 in the first mask, and V1 in the second mask 5977 // so that we can OR them together. 5978 for (unsigned i = 0; i != 16; ++i) { 5979 int EltIdx = MaskVals[i]; 5980 if (EltIdx < 0 || EltIdx >= 16) 5981 EltIdx = 0x80; 5982 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5983 } 5984 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5985 DAG.getNode(ISD::BUILD_VECTOR, dl, 5986 MVT::v16i8, &pshufbMask[0], 16)); 5987 5988 // As PSHUFB will zero elements with negative indices, it's safe to ignore 5989 // the 2nd operand if it's undefined or zero. 5990 if (V2.getOpcode() == ISD::UNDEF || 5991 ISD::isBuildVectorAllZeros(V2.getNode())) 5992 return V1; 5993 5994 // Calculate the shuffle mask for the second input, shuffle it, and 5995 // OR it with the first shuffled input. 5996 pshufbMask.clear(); 5997 for (unsigned i = 0; i != 16; ++i) { 5998 int EltIdx = MaskVals[i]; 5999 EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; 6000 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 6001 } 6002 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 6003 DAG.getNode(ISD::BUILD_VECTOR, dl, 6004 MVT::v16i8, &pshufbMask[0], 16)); 6005 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 6006 } 6007 6008 // No SSSE3 - Calculate in place words and then fix all out of place words 6009 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 6010 // the 16 different words that comprise the two doublequadword input vectors. 6011 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 6012 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 6013 SDValue NewV = V1; 6014 for (int i = 0; i != 8; ++i) { 6015 int Elt0 = MaskVals[i*2]; 6016 int Elt1 = MaskVals[i*2+1]; 6017 6018 // This word of the result is all undef, skip it. 6019 if (Elt0 < 0 && Elt1 < 0) 6020 continue; 6021 6022 // This word of the result is already in the correct place, skip it. 
6023 if ((Elt0 == i*2) && (Elt1 == i*2+1)) 6024 continue; 6025 6026 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 6027 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 6028 SDValue InsElt; 6029 6030 // If Elt0 and Elt1 are defined, are consecutive, and can be load 6031 // using a single extract together, load it and store it. 6032 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 6033 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 6034 DAG.getIntPtrConstant(Elt1 / 2)); 6035 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 6036 DAG.getIntPtrConstant(i)); 6037 continue; 6038 } 6039 6040 // If Elt1 is defined, extract it from the appropriate source. If the 6041 // source byte is not also odd, shift the extracted word left 8 bits 6042 // otherwise clear the bottom 8 bits if we need to do an or. 6043 if (Elt1 >= 0) { 6044 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 6045 DAG.getIntPtrConstant(Elt1 / 2)); 6046 if ((Elt1 & 1) == 0) 6047 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 6048 DAG.getConstant(8, 6049 TLI.getShiftAmountTy(InsElt.getValueType()))); 6050 else if (Elt0 >= 0) 6051 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 6052 DAG.getConstant(0xFF00, MVT::i16)); 6053 } 6054 // If Elt0 is defined, extract it from the appropriate source. If the 6055 // source byte is not also even, shift the extracted word right 8 bits. If 6056 // Elt1 was also defined, OR the extracted values together before 6057 // inserting them in the result. 6058 if (Elt0 >= 0) { 6059 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 6060 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 6061 if ((Elt0 & 1) != 0) 6062 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 6063 DAG.getConstant(8, 6064 TLI.getShiftAmountTy(InsElt0.getValueType()))); 6065 else if (Elt1 >= 0) 6066 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 6067 DAG.getConstant(0x00FF, MVT::i16)); 6068 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 6069 : InsElt0; 6070 } 6071 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 6072 DAG.getIntPtrConstant(i)); 6073 } 6074 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 6075} 6076 6077// v32i8 shuffles - Translate to VPSHUFB if possible. 6078static 6079SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, 6080 const X86Subtarget *Subtarget, 6081 SelectionDAG &DAG) { 6082 EVT VT = SVOp->getValueType(0); 6083 SDValue V1 = SVOp->getOperand(0); 6084 SDValue V2 = SVOp->getOperand(1); 6085 DebugLoc dl = SVOp->getDebugLoc(); 6086 SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); 6087 6088 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 6089 bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode()); 6090 bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode()); 6091 6092 // VPSHUFB may be generated if 6093 // (1) one of input vector is undefined or zeroinitializer. 6094 // The mask value 0x80 puts 0 in the corresponding slot of the vector. 6095 // And (2) the mask indexes don't cross the 128-bit lane. 
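  // AVX2 VPSHUFB shuffles each 128-bit lane independently, so e.g. a mask
  // entry of 20 for result byte 3 (lower lane reading from the upper lane)
  // cannot be expressed and we bail out below; in-lane indexes are reduced
  // to 0..15, and undef slots use 0x80, which makes VPSHUFB write a zero.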
6096 if (VT != MVT::v32i8 || !Subtarget->hasInt256() || 6097 (!V2IsUndef && !V2IsAllZero && !V1IsAllZero)) 6098 return SDValue(); 6099 6100 if (V1IsAllZero && !V2IsAllZero) { 6101 CommuteVectorShuffleMask(MaskVals, 32); 6102 V1 = V2; 6103 } 6104 SmallVector<SDValue, 32> pshufbMask; 6105 for (unsigned i = 0; i != 32; i++) { 6106 int EltIdx = MaskVals[i]; 6107 if (EltIdx < 0 || EltIdx >= 32) 6108 EltIdx = 0x80; 6109 else { 6110 if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16)) 6111 // Cross lane is not allowed. 6112 return SDValue(); 6113 EltIdx &= 0xf; 6114 } 6115 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 6116 } 6117 return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1, 6118 DAG.getNode(ISD::BUILD_VECTOR, dl, 6119 MVT::v32i8, &pshufbMask[0], 32)); 6120} 6121 6122/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 6123/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 6124/// done when every pair / quad of shuffle mask elements point to elements in 6125/// the right sequence. e.g. 6126/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 6127static 6128SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 6129 SelectionDAG &DAG, DebugLoc dl) { 6130 MVT VT = SVOp->getValueType(0).getSimpleVT(); 6131 unsigned NumElems = VT.getVectorNumElements(); 6132 MVT NewVT; 6133 unsigned Scale; 6134 switch (VT.SimpleTy) { 6135 default: llvm_unreachable("Unexpected!"); 6136 case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; 6137 case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; 6138 case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; 6139 case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break; 6140 case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break; 6141 case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break; 6142 } 6143 6144 SmallVector<int, 8> MaskVec; 6145 for (unsigned i = 0; i != NumElems; i += Scale) { 6146 int StartIdx = -1; 6147 for (unsigned j = 0; j != Scale; ++j) { 6148 int EltIdx = SVOp->getMaskElt(i+j); 6149 if (EltIdx < 0) 6150 continue; 6151 if (StartIdx < 0) 6152 StartIdx = (EltIdx / Scale); 6153 if (EltIdx != (int)(StartIdx*Scale + j)) 6154 return SDValue(); 6155 } 6156 MaskVec.push_back(StartIdx); 6157 } 6158 6159 SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0)); 6160 SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1)); 6161 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 6162} 6163 6164/// getVZextMovL - Return a zero-extending vector move low node. 6165/// 6166static SDValue getVZextMovL(EVT VT, EVT OpVT, 6167 SDValue SrcOp, SelectionDAG &DAG, 6168 const X86Subtarget *Subtarget, DebugLoc dl) { 6169 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 6170 LoadSDNode *LD = NULL; 6171 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 6172 LD = dyn_cast<LoadSDNode>(SrcOp); 6173 if (!LD) { 6174 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 6175 // instead. 6176 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 6177 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 6178 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 6179 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 6180 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 6181 // PR2108 6182 OpVT = (OpVT == MVT::v2f64) ? 
MVT::v2i64 : MVT::v4i32; 6183 return DAG.getNode(ISD::BITCAST, dl, VT, 6184 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 6185 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6186 OpVT, 6187 SrcOp.getOperand(0) 6188 .getOperand(0)))); 6189 } 6190 } 6191 } 6192 6193 return DAG.getNode(ISD::BITCAST, dl, VT, 6194 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 6195 DAG.getNode(ISD::BITCAST, dl, 6196 OpVT, SrcOp))); 6197} 6198 6199/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles 6200/// which could not be matched by any known target speficic shuffle 6201static SDValue 6202LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 6203 6204 SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG); 6205 if (NewOp.getNode()) 6206 return NewOp; 6207 6208 EVT VT = SVOp->getValueType(0); 6209 6210 unsigned NumElems = VT.getVectorNumElements(); 6211 unsigned NumLaneElems = NumElems / 2; 6212 6213 DebugLoc dl = SVOp->getDebugLoc(); 6214 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 6215 EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); 6216 SDValue Output[2]; 6217 6218 SmallVector<int, 16> Mask; 6219 for (unsigned l = 0; l < 2; ++l) { 6220 // Build a shuffle mask for the output, discovering on the fly which 6221 // input vectors to use as shuffle operands (recorded in InputUsed). 6222 // If building a suitable shuffle vector proves too hard, then bail 6223 // out with UseBuildVector set. 6224 bool UseBuildVector = false; 6225 int InputUsed[2] = { -1, -1 }; // Not yet discovered. 6226 unsigned LaneStart = l * NumLaneElems; 6227 for (unsigned i = 0; i != NumLaneElems; ++i) { 6228 // The mask element. This indexes into the input. 6229 int Idx = SVOp->getMaskElt(i+LaneStart); 6230 if (Idx < 0) { 6231 // the mask element does not index into any input vector. 6232 Mask.push_back(-1); 6233 continue; 6234 } 6235 6236 // The input vector this mask element indexes into. 6237 int Input = Idx / NumLaneElems; 6238 6239 // Turn the index into an offset from the start of the input vector. 6240 Idx -= Input * NumLaneElems; 6241 6242 // Find or create a shuffle vector operand to hold this input. 6243 unsigned OpNo; 6244 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { 6245 if (InputUsed[OpNo] == Input) 6246 // This input vector is already an operand. 6247 break; 6248 if (InputUsed[OpNo] < 0) { 6249 // Create a new operand for this input vector. 6250 InputUsed[OpNo] = Input; 6251 break; 6252 } 6253 } 6254 6255 if (OpNo >= array_lengthof(InputUsed)) { 6256 // More than two input vectors used! Give up on trying to create a 6257 // shuffle vector. Insert all elements into a BUILD_VECTOR instead. 6258 UseBuildVector = true; 6259 break; 6260 } 6261 6262 // Add the mask index for the new shuffle vector. 6263 Mask.push_back(Idx + OpNo * NumLaneElems); 6264 } 6265 6266 if (UseBuildVector) { 6267 SmallVector<SDValue, 16> SVOps; 6268 for (unsigned i = 0; i != NumLaneElems; ++i) { 6269 // The mask element. This indexes into the input. 6270 int Idx = SVOp->getMaskElt(i+LaneStart); 6271 if (Idx < 0) { 6272 SVOps.push_back(DAG.getUNDEF(EltVT)); 6273 continue; 6274 } 6275 6276 // The input vector this mask element indexes into. 6277 int Input = Idx / NumElems; 6278 6279 // Turn the index into an offset from the start of the input vector. 6280 Idx -= Input * NumElems; 6281 6282 // Extract the vector element by hand. 6283 SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 6284 SVOp->getOperand(Input), 6285 DAG.getIntPtrConstant(Idx))); 6286 } 6287 6288 // Construct the output using a BUILD_VECTOR. 
6289 Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0], 6290 SVOps.size()); 6291 } else if (InputUsed[0] < 0) { 6292 // No input vectors were used! The result is undefined. 6293 Output[l] = DAG.getUNDEF(NVT); 6294 } else { 6295 SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), 6296 (InputUsed[0] % 2) * NumLaneElems, 6297 DAG, dl); 6298 // If only one input was used, use an undefined vector for the other. 6299 SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : 6300 Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), 6301 (InputUsed[1] % 2) * NumLaneElems, DAG, dl); 6302 // At least one input vector was used. Create a new shuffle vector. 6303 Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); 6304 } 6305 6306 Mask.clear(); 6307 } 6308 6309 // Concatenate the result back 6310 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]); 6311} 6312 6313/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with 6314/// 4 elements, and match them with several different shuffle types. 6315static SDValue 6316LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 6317 SDValue V1 = SVOp->getOperand(0); 6318 SDValue V2 = SVOp->getOperand(1); 6319 DebugLoc dl = SVOp->getDebugLoc(); 6320 EVT VT = SVOp->getValueType(0); 6321 6322 assert(VT.is128BitVector() && "Unsupported vector size"); 6323 6324 std::pair<int, int> Locs[4]; 6325 int Mask1[] = { -1, -1, -1, -1 }; 6326 SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); 6327 6328 unsigned NumHi = 0; 6329 unsigned NumLo = 0; 6330 for (unsigned i = 0; i != 4; ++i) { 6331 int Idx = PermMask[i]; 6332 if (Idx < 0) { 6333 Locs[i] = std::make_pair(-1, -1); 6334 } else { 6335 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 6336 if (Idx < 4) { 6337 Locs[i] = std::make_pair(0, NumLo); 6338 Mask1[NumLo] = Idx; 6339 NumLo++; 6340 } else { 6341 Locs[i] = std::make_pair(1, NumHi); 6342 if (2+NumHi < 4) 6343 Mask1[2+NumHi] = Idx; 6344 NumHi++; 6345 } 6346 } 6347 } 6348 6349 if (NumLo <= 2 && NumHi <= 2) { 6350 // If no more than two elements come from either vector. This can be 6351 // implemented with two shuffles. First shuffle gather the elements. 6352 // The second shuffle, which takes the first shuffle as both of its 6353 // vector operands, put the elements into the right order. 6354 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6355 6356 int Mask2[] = { -1, -1, -1, -1 }; 6357 6358 for (unsigned i = 0; i != 4; ++i) 6359 if (Locs[i].first != -1) { 6360 unsigned Idx = (i < 2) ? 0 : 4; 6361 Idx += Locs[i].first * 2 + Locs[i].second; 6362 Mask2[i] = Idx; 6363 } 6364 6365 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 6366 } 6367 6368 if (NumLo == 3 || NumHi == 3) { 6369 // Otherwise, we must have three elements from one vector, call it X, and 6370 // one element from the other, call it Y. First, use a shufps to build an 6371 // intermediate vector with the one element from Y and the element from X 6372 // that will be in the same half in the final destination (the indexes don't 6373 // matter). Then, use a shufps to build the final vector, taking the half 6374 // containing the element from Y from the intermediate, and the other half 6375 // from X. 6376 if (NumHi == 3) { 6377 // Normalize it so the 3 elements come from V1. 6378 CommuteVectorShuffleMask(PermMask, 4); 6379 std::swap(V1, V2); 6380 } 6381 6382 // Find the element from V2. 
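    // For illustration, with PermMask <0, 5, 2, 3> the single V2 element is
    // at HiIndex == 1: the first shuffle builds <V2[1], u, V1[0], u>, and
    // since HiIndex < 2 the second shuffle below combines it with V1 using
    // mask <2, 0, 6, 7> to produce <V1[0], V2[1], V1[2], V1[3]>.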
6383 unsigned HiIndex; 6384 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 6385 int Val = PermMask[HiIndex]; 6386 if (Val < 0) 6387 continue; 6388 if (Val >= 4) 6389 break; 6390 } 6391 6392 Mask1[0] = PermMask[HiIndex]; 6393 Mask1[1] = -1; 6394 Mask1[2] = PermMask[HiIndex^1]; 6395 Mask1[3] = -1; 6396 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6397 6398 if (HiIndex >= 2) { 6399 Mask1[0] = PermMask[0]; 6400 Mask1[1] = PermMask[1]; 6401 Mask1[2] = HiIndex & 1 ? 6 : 4; 6402 Mask1[3] = HiIndex & 1 ? 4 : 6; 6403 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6404 } 6405 6406 Mask1[0] = HiIndex & 1 ? 2 : 0; 6407 Mask1[1] = HiIndex & 1 ? 0 : 2; 6408 Mask1[2] = PermMask[2]; 6409 Mask1[3] = PermMask[3]; 6410 if (Mask1[2] >= 0) 6411 Mask1[2] += 4; 6412 if (Mask1[3] >= 0) 6413 Mask1[3] += 4; 6414 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 6415 } 6416 6417 // Break it into (shuffle shuffle_hi, shuffle_lo). 6418 int LoMask[] = { -1, -1, -1, -1 }; 6419 int HiMask[] = { -1, -1, -1, -1 }; 6420 6421 int *MaskPtr = LoMask; 6422 unsigned MaskIdx = 0; 6423 unsigned LoIdx = 0; 6424 unsigned HiIdx = 2; 6425 for (unsigned i = 0; i != 4; ++i) { 6426 if (i == 2) { 6427 MaskPtr = HiMask; 6428 MaskIdx = 1; 6429 LoIdx = 0; 6430 HiIdx = 2; 6431 } 6432 int Idx = PermMask[i]; 6433 if (Idx < 0) { 6434 Locs[i] = std::make_pair(-1, -1); 6435 } else if (Idx < 4) { 6436 Locs[i] = std::make_pair(MaskIdx, LoIdx); 6437 MaskPtr[LoIdx] = Idx; 6438 LoIdx++; 6439 } else { 6440 Locs[i] = std::make_pair(MaskIdx, HiIdx); 6441 MaskPtr[HiIdx] = Idx; 6442 HiIdx++; 6443 } 6444 } 6445 6446 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 6447 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 6448 int MaskOps[] = { -1, -1, -1, -1 }; 6449 for (unsigned i = 0; i != 4; ++i) 6450 if (Locs[i].first != -1) 6451 MaskOps[i] = Locs[i].first * 4 + Locs[i].second; 6452 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 6453} 6454 6455static bool MayFoldVectorLoad(SDValue V) { 6456 while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6457 V = V.getOperand(0); 6458 6459 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 6460 V = V.getOperand(0); 6461 if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && 6462 V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) 6463 // BUILD_VECTOR (load), undef 6464 V = V.getOperand(0); 6465 6466 return MayFoldLoad(V); 6467} 6468 6469static 6470SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { 6471 EVT VT = Op.getValueType(); 6472 6473 // Canonizalize to v2f64. 
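  // MOVDDUP duplicates the low 64-bit element (<a, b> -> <a, a>), so any
  // 128-bit type can be handled by bitcasting through v2f64 and back.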
6474 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 6475 return DAG.getNode(ISD::BITCAST, dl, VT, 6476 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 6477 V1, DAG)); 6478} 6479 6480static 6481SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, 6482 bool HasSSE2) { 6483 SDValue V1 = Op.getOperand(0); 6484 SDValue V2 = Op.getOperand(1); 6485 EVT VT = Op.getValueType(); 6486 6487 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 6488 6489 if (HasSSE2 && VT == MVT::v2f64) 6490 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 6491 6492 // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1) 6493 return DAG.getNode(ISD::BITCAST, dl, VT, 6494 getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32, 6495 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1), 6496 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG)); 6497} 6498 6499static 6500SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { 6501 SDValue V1 = Op.getOperand(0); 6502 SDValue V2 = Op.getOperand(1); 6503 EVT VT = Op.getValueType(); 6504 6505 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 6506 "unsupported shuffle type"); 6507 6508 if (V2.getOpcode() == ISD::UNDEF) 6509 V2 = V1; 6510 6511 // v4i32 or v4f32 6512 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 6513} 6514 6515static 6516SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 6517 SDValue V1 = Op.getOperand(0); 6518 SDValue V2 = Op.getOperand(1); 6519 EVT VT = Op.getValueType(); 6520 unsigned NumElems = VT.getVectorNumElements(); 6521 6522 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 6523 // operand of these instructions is only memory, so check if there's a 6524 // potencial load folding here, otherwise use SHUFPS or MOVSD to match the 6525 // same masks. 6526 bool CanFoldLoad = false; 6527 6528 // Trivial case, when V2 comes from a load. 6529 if (MayFoldVectorLoad(V2)) 6530 CanFoldLoad = true; 6531 6532 // When V1 is a load, it can be folded later into a store in isel, example: 6533 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 6534 // turns into: 6535 // (MOVLPSmr addr:$src1, VR128:$src2) 6536 // So, recognize this potential and also use MOVLPS or MOVLPD 6537 else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 6538 CanFoldLoad = true; 6539 6540 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6541 if (CanFoldLoad) { 6542 if (HasSSE2 && NumElems == 2) 6543 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 6544 6545 if (NumElems == 4) 6546 // If we don't care about the second element, proceed to use movss. 6547 if (SVOp->getMaskElt(1) != -1) 6548 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 6549 } 6550 6551 // movl and movlp will both match v2i64, but v2i64 is never matched by 6552 // movl earlier because we make it strict to avoid messing with the movlp load 6553 // folding logic (see the code above getMOVLP call). Match it here then, 6554 // this is horrible, but will stay like this until we move all shuffle 6555 // matching to x86 specific nodes. Note that for the 1st condition all 6556 // types are matched with movsd. 
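  // E.g. the v2i64/v2f64 MOVL mask <2, 1> (low element from V2, high element
  // from V1) becomes MOVSD here, while the v4i32/v4f32 analogue <4, 1, 2, 3>
  // becomes MOVSS.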
6557 if (HasSSE2) { 6558 // FIXME: isMOVLMask should be checked and matched before getMOVLP, 6559 // as to remove this logic from here, as much as possible 6560 if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT)) 6561 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6562 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6563 } 6564 6565 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 6566 6567 // Invert the operand order and use SHUFPS to match it. 6568 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1, 6569 getShuffleSHUFImmediate(SVOp), DAG); 6570} 6571 6572// Reduce a vector shuffle to zext. 6573SDValue 6574X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { 6575 // PMOVZX is only available from SSE41. 6576 if (!Subtarget->hasSSE41()) 6577 return SDValue(); 6578 6579 EVT VT = Op.getValueType(); 6580 6581 // Only AVX2 support 256-bit vector integer extending. 6582 if (!Subtarget->hasInt256() && VT.is256BitVector()) 6583 return SDValue(); 6584 6585 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6586 DebugLoc DL = Op.getDebugLoc(); 6587 SDValue V1 = Op.getOperand(0); 6588 SDValue V2 = Op.getOperand(1); 6589 unsigned NumElems = VT.getVectorNumElements(); 6590 6591 // Extending is an unary operation and the element type of the source vector 6592 // won't be equal to or larger than i64. 6593 if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() || 6594 VT.getVectorElementType() == MVT::i64) 6595 return SDValue(); 6596 6597 // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4. 6598 unsigned Shift = 1; // Start from 2, i.e. 1 << 1. 6599 while ((1U << Shift) < NumElems) { 6600 if (SVOp->getMaskElt(1U << Shift) == 1) 6601 break; 6602 Shift += 1; 6603 // The maximal ratio is 8, i.e. from i8 to i64. 6604 if (Shift > 3) 6605 return SDValue(); 6606 } 6607 6608 // Check the shuffle mask. 6609 unsigned Mask = (1U << Shift) - 1; 6610 for (unsigned i = 0; i != NumElems; ++i) { 6611 int EltIdx = SVOp->getMaskElt(i); 6612 if ((i & Mask) != 0 && EltIdx != -1) 6613 return SDValue(); 6614 if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift)) 6615 return SDValue(); 6616 } 6617 6618 unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; 6619 EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits); 6620 EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift); 6621 6622 if (!isTypeLegal(NVT)) 6623 return SDValue(); 6624 6625 // Simplify the operand as it's prepared to be fed into shuffle. 6626 unsigned SignificantBits = NVT.getSizeInBits() >> Shift; 6627 if (V1.getOpcode() == ISD::BITCAST && 6628 V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && 6629 V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 6630 V1.getOperand(0) 6631 .getOperand(0).getValueType().getSizeInBits() == SignificantBits) { 6632 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) 6633 SDValue V = V1.getOperand(0).getOperand(0).getOperand(0); 6634 ConstantSDNode *CIdx = 6635 dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1)); 6636 // If it's foldable, i.e. normal load with single use, we will let code 6637 // selection to fold it. Otherwise, we will short the conversion sequence. 
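  // That is, when lane 0 of some wider vector was extracted to a scalar and
  // splatted back in, and the scalar is not a load that isel would fold
  // anyway, the original vector is fed straight into VZEXT below.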
6638 if (CIdx && CIdx->getZExtValue() == 0 && 6639 (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) 6640 V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V); 6641 } 6642 6643 return DAG.getNode(ISD::BITCAST, DL, VT, 6644 DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); 6645} 6646 6647SDValue 6648X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { 6649 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6650 EVT VT = Op.getValueType(); 6651 DebugLoc dl = Op.getDebugLoc(); 6652 SDValue V1 = Op.getOperand(0); 6653 SDValue V2 = Op.getOperand(1); 6654 6655 if (isZeroShuffle(SVOp)) 6656 return getZeroVector(VT, Subtarget, DAG, dl); 6657 6658 // Handle splat operations 6659 if (SVOp->isSplat()) { 6660 unsigned NumElem = VT.getVectorNumElements(); 6661 int Size = VT.getSizeInBits(); 6662 6663 // Use vbroadcast whenever the splat comes from a foldable load 6664 SDValue Broadcast = LowerVectorBroadcast(Op, DAG); 6665 if (Broadcast.getNode()) 6666 return Broadcast; 6667 6668 // Handle splats by matching through known shuffle masks 6669 if ((Size == 128 && NumElem <= 4) || 6670 (Size == 256 && NumElem <= 8)) 6671 return SDValue(); 6672 6673 // All remaning splats are promoted to target supported vector shuffles. 6674 return PromoteSplat(SVOp, DAG); 6675 } 6676 6677 // Check integer expanding shuffles. 6678 SDValue NewOp = lowerVectorIntExtend(Op, DAG); 6679 if (NewOp.getNode()) 6680 return NewOp; 6681 6682 // If the shuffle can be profitably rewritten as a narrower shuffle, then 6683 // do it! 6684 if (VT == MVT::v8i16 || VT == MVT::v16i8 || 6685 VT == MVT::v16i16 || VT == MVT::v32i8) { 6686 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6687 if (NewOp.getNode()) 6688 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 6689 } else if ((VT == MVT::v4i32 || 6690 (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 6691 // FIXME: Figure out a cleaner way to do this. 6692 // Try to make use of movq to zero out the top part. 6693 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 6694 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6695 if (NewOp.getNode()) { 6696 EVT NewVT = NewOp.getValueType(); 6697 if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), 6698 NewVT, true, false)) 6699 return getVZextMovL(VT, NewVT, NewOp.getOperand(0), 6700 DAG, Subtarget, dl); 6701 } 6702 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 6703 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6704 if (NewOp.getNode()) { 6705 EVT NewVT = NewOp.getValueType(); 6706 if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) 6707 return getVZextMovL(VT, NewVT, NewOp.getOperand(1), 6708 DAG, Subtarget, dl); 6709 } 6710 } 6711 } 6712 return SDValue(); 6713} 6714 6715SDValue 6716X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 6717 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6718 SDValue V1 = Op.getOperand(0); 6719 SDValue V2 = Op.getOperand(1); 6720 EVT VT = Op.getValueType(); 6721 DebugLoc dl = Op.getDebugLoc(); 6722 unsigned NumElems = VT.getVectorNumElements(); 6723 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 6724 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 6725 bool V1IsSplat = false; 6726 bool V2IsSplat = false; 6727 bool HasSSE2 = Subtarget->hasSSE2(); 6728 bool HasFp256 = Subtarget->hasFp256(); 6729 bool HasInt256 = Subtarget->hasInt256(); 6730 MachineFunction &MF = DAG.getMachineFunction(); 6731 bool OptForSize = MF.getFunction()->getFnAttributes(). 
6732 hasAttribute(Attribute::OptimizeForSize); 6733 6734 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); 6735 6736 if (V1IsUndef && V2IsUndef) 6737 return DAG.getUNDEF(VT); 6738 6739 assert(!V1IsUndef && "Op 1 of shuffle should not be undef"); 6740 6741 // Vector shuffle lowering takes 3 steps: 6742 // 6743 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 6744 // narrowing and commutation of operands should be handled. 6745 // 2) Matching of shuffles with known shuffle masks to x86 target specific 6746 // shuffle nodes. 6747 // 3) Rewriting of unmatched masks into new generic shuffle operations, 6748 // so the shuffle can be broken into other shuffles and the legalizer can 6749 // try the lowering again. 6750 // 6751 // The general idea is that no vector_shuffle operation should be left to 6752 // be matched during isel, all of them must be converted to a target specific 6753 // node here. 6754 6755 // Normalize the input vectors. Here splats, zeroed vectors, profitable 6756 // narrowing and commutation of operands should be handled. The actual code 6757 // doesn't include all of those, work in progress... 6758 SDValue NewOp = NormalizeVectorShuffle(Op, DAG); 6759 if (NewOp.getNode()) 6760 return NewOp; 6761 6762 SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); 6763 6764 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 6765 // unpckh_undef). Only use pshufd if speed is more important than size. 6766 if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256)) 6767 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6768 if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256)) 6769 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6770 6771 if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && 6772 V2IsUndef && MayFoldVectorLoad(V1)) 6773 return getMOVDDup(Op, dl, V1, DAG); 6774 6775 if (isMOVHLPS_v_undef_Mask(M, VT)) 6776 return getMOVHighToLow(Op, dl, DAG); 6777 6778 // Use to match splats 6779 if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef && 6780 (VT == MVT::v2f64 || VT == MVT::v2i64)) 6781 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6782 6783 if (isPSHUFDMask(M, VT)) { 6784 // The actual implementation will match the mask in the if above and then 6785 // during isel it can match several different instructions, not only pshufd 6786 // as its name says, sad but true, emulate the behavior for now... 6787 if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 6788 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 6789 6790 unsigned TargetMask = getShuffleSHUFImmediate(SVOp); 6791 6792 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 6793 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 6794 6795 if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) 6796 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, 6797 DAG); 6798 6799 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, 6800 TargetMask, DAG); 6801 } 6802 6803 // Check if this can be converted into a logical shift. 6804 bool isLeft = false; 6805 unsigned ShAmt = 0; 6806 SDValue ShVal; 6807 bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 6808 if (isShift && ShVal.hasOneUse()) { 6809 // If the shifted value has multiple uses, it may be cheaper to use 6810 // v_set0 + movlhps or movhlps, etc. 
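    // E.g. a v4i32 shuffle <4, 0, 1, 2> against a zero vector is a whole
    // register shift left by one element; after scaling ShAmt to bits this
    // becomes a single PSLLDQ-style shift.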
6811 EVT EltVT = VT.getVectorElementType(); 6812 ShAmt *= EltVT.getSizeInBits(); 6813 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6814 } 6815 6816 if (isMOVLMask(M, VT)) { 6817 if (ISD::isBuildVectorAllZeros(V1.getNode())) 6818 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 6819 if (!isMOVLPMask(M, VT)) { 6820 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 6821 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6822 6823 if (VT == MVT::v4i32 || VT == MVT::v4f32) 6824 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6825 } 6826 } 6827 6828 // FIXME: fold these into legal mask. 6829 if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256)) 6830 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 6831 6832 if (isMOVHLPSMask(M, VT)) 6833 return getMOVHighToLow(Op, dl, DAG); 6834 6835 if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) 6836 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 6837 6838 if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) 6839 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 6840 6841 if (isMOVLPMask(M, VT)) 6842 return getMOVLP(Op, dl, DAG, HasSSE2); 6843 6844 if (ShouldXformToMOVHLPS(M, VT) || 6845 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) 6846 return CommuteVectorShuffle(SVOp, DAG); 6847 6848 if (isShift) { 6849 // No better options. Use a vshldq / vsrldq. 6850 EVT EltVT = VT.getVectorElementType(); 6851 ShAmt *= EltVT.getSizeInBits(); 6852 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6853 } 6854 6855 bool Commuted = false; 6856 // FIXME: This should also accept a bitcast of a splat? Be careful, not 6857 // 1,1,1,1 -> v8i16 though. 6858 V1IsSplat = isSplatVector(V1.getNode()); 6859 V2IsSplat = isSplatVector(V2.getNode()); 6860 6861 // Canonicalize the splat or undef, if present, to be on the RHS. 6862 if (!V2IsUndef && V1IsSplat && !V2IsSplat) { 6863 CommuteVectorShuffleMask(M, NumElems); 6864 std::swap(V1, V2); 6865 std::swap(V1IsSplat, V2IsSplat); 6866 Commuted = true; 6867 } 6868 6869 if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { 6870 // Shuffling low element of v1 into undef, just return v1. 6871 if (V2IsUndef) 6872 return V1; 6873 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 6874 // the instruction selector will not match, so get a canonical MOVL with 6875 // swapped operands to undo the commute. 6876 return getMOVL(DAG, dl, VT, V2, V1); 6877 } 6878 6879 if (isUNPCKLMask(M, VT, HasInt256)) 6880 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 6881 6882 if (isUNPCKHMask(M, VT, HasInt256)) 6883 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 6884 6885 if (V2IsSplat) { 6886 // Normalize mask so all entries that point to V2 points to its first 6887 // element then try to match unpck{h|l} again. If match, return a 6888 // new vector_shuffle with the corrected mask.p 6889 SmallVector<int, 8> NewMask(M.begin(), M.end()); 6890 NormalizeMask(NewMask, NumElems); 6891 if (isUNPCKLMask(NewMask, VT, HasInt256, true)) 6892 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 6893 if (isUNPCKHMask(NewMask, VT, HasInt256, true)) 6894 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 6895 } 6896 6897 if (Commuted) { 6898 // Commute is back and try unpck* again. 6899 // FIXME: this seems wrong. 
6900 CommuteVectorShuffleMask(M, NumElems); 6901 std::swap(V1, V2); 6902 std::swap(V1IsSplat, V2IsSplat); 6903 Commuted = false; 6904 6905 if (isUNPCKLMask(M, VT, HasInt256)) 6906 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 6907 6908 if (isUNPCKHMask(M, VT, HasInt256)) 6909 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 6910 } 6911 6912 // Normalize the node to match x86 shuffle ops if needed 6913 if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true))) 6914 return CommuteVectorShuffle(SVOp, DAG); 6915 6916 // The checks below are all present in isShuffleMaskLegal, but they are 6917 // inlined here right now to enable us to directly emit target specific 6918 // nodes, and remove one by one until they don't return Op anymore. 6919 6920 if (isPALIGNRMask(M, VT, Subtarget)) 6921 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 6922 getShufflePALIGNRImmediate(SVOp), 6923 DAG); 6924 6925 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 6926 SVOp->getSplatIndex() == 0 && V2IsUndef) { 6927 if (VT == MVT::v2f64 || VT == MVT::v2i64) 6928 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6929 } 6930 6931 if (isPSHUFHWMask(M, VT, HasInt256)) 6932 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 6933 getShufflePSHUFHWImmediate(SVOp), 6934 DAG); 6935 6936 if (isPSHUFLWMask(M, VT, HasInt256)) 6937 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 6938 getShufflePSHUFLWImmediate(SVOp), 6939 DAG); 6940 6941 if (isSHUFPMask(M, VT, HasFp256)) 6942 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, 6943 getShuffleSHUFImmediate(SVOp), DAG); 6944 6945 if (isUNPCKL_v_undef_Mask(M, VT, HasInt256)) 6946 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6947 if (isUNPCKH_v_undef_Mask(M, VT, HasInt256)) 6948 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6949 6950 //===--------------------------------------------------------------------===// 6951 // Generate target specific nodes for 128 or 256-bit shuffles only 6952 // supported in the AVX instruction set. 6953 // 6954 6955 // Handle VMOVDDUPY permutations 6956 if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256)) 6957 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); 6958 6959 // Handle VPERMILPS/D* permutations 6960 if (isVPERMILPMask(M, VT, HasFp256)) { 6961 if (HasInt256 && VT == MVT::v8i32) 6962 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, 6963 getShuffleSHUFImmediate(SVOp), DAG); 6964 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, 6965 getShuffleSHUFImmediate(SVOp), DAG); 6966 } 6967 6968 // Handle VPERM2F128/VPERM2I128 permutations 6969 if (isVPERM2X128Mask(M, VT, HasFp256)) 6970 return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, 6971 V2, getShuffleVPERM2X128Immediate(SVOp), DAG); 6972 6973 SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG); 6974 if (BlendOp.getNode()) 6975 return BlendOp; 6976 6977 if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) { 6978 SmallVector<SDValue, 8> permclMask; 6979 for (unsigned i = 0; i != 8; ++i) { 6980 permclMask.push_back(DAG.getConstant((M[i]>=0) ? 
M[i] : 0, MVT::i32)); 6981 } 6982 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, 6983 &permclMask[0], 8); 6984 // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 6985 return DAG.getNode(X86ISD::VPERMV, dl, VT, 6986 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); 6987 } 6988 6989 if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64)) 6990 return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, 6991 getShuffleCLImmediate(SVOp), DAG); 6992 6993 //===--------------------------------------------------------------------===// 6994 // Since no target specific shuffle was selected for this generic one, 6995 // lower it into other known shuffles. FIXME: this isn't true yet, but 6996 // this is the plan. 6997 // 6998 6999 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 7000 if (VT == MVT::v8i16) { 7001 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); 7002 if (NewOp.getNode()) 7003 return NewOp; 7004 } 7005 7006 if (VT == MVT::v16i8) { 7007 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 7008 if (NewOp.getNode()) 7009 return NewOp; 7010 } 7011 7012 if (VT == MVT::v32i8) { 7013 SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); 7014 if (NewOp.getNode()) 7015 return NewOp; 7016 } 7017 7018 // Handle all 128-bit wide vectors with 4 elements, and match them with 7019 // several different shuffle types. 7020 if (NumElems == 4 && VT.is128BitVector()) 7021 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 7022 7023 // Handle general 256-bit shuffles 7024 if (VT.is256BitVector()) 7025 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 7026 7027 return SDValue(); 7028} 7029 7030SDValue 7031X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 7032 SelectionDAG &DAG) const { 7033 EVT VT = Op.getValueType(); 7034 DebugLoc dl = Op.getDebugLoc(); 7035 7036 if (!Op.getOperand(0).getValueType().is128BitVector()) 7037 return SDValue(); 7038 7039 if (VT.getSizeInBits() == 8) { 7040 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 7041 Op.getOperand(0), Op.getOperand(1)); 7042 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 7043 DAG.getValueType(VT)); 7044 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 7045 } 7046 7047 if (VT.getSizeInBits() == 16) { 7048 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7049 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 7050 if (Idx == 0) 7051 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 7052 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 7053 DAG.getNode(ISD::BITCAST, dl, 7054 MVT::v4i32, 7055 Op.getOperand(0)), 7056 Op.getOperand(1))); 7057 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 7058 Op.getOperand(0), Op.getOperand(1)); 7059 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 7060 DAG.getValueType(VT)); 7061 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 7062 } 7063 7064 if (VT == MVT::f32) { 7065 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 7066 // the result back to FR32 register. It's only worth matching if the 7067 // result has a single use which is a store or a bitcast to i32. And in 7068 // the case of a store, it's not worth it if the index is a constant 0, 7069 // because a MOVSSmr can be used instead, which is smaller and faster. 
7070 if (!Op.hasOneUse()) 7071 return SDValue(); 7072 SDNode *User = *Op.getNode()->use_begin(); 7073 if ((User->getOpcode() != ISD::STORE || 7074 (isa<ConstantSDNode>(Op.getOperand(1)) && 7075 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 7076 (User->getOpcode() != ISD::BITCAST || 7077 User->getValueType(0) != MVT::i32)) 7078 return SDValue(); 7079 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 7080 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 7081 Op.getOperand(0)), 7082 Op.getOperand(1)); 7083 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 7084 } 7085 7086 if (VT == MVT::i32 || VT == MVT::i64) { 7087 // ExtractPS/pextrq works with constant index. 7088 if (isa<ConstantSDNode>(Op.getOperand(1))) 7089 return Op; 7090 } 7091 return SDValue(); 7092} 7093 7094SDValue 7095X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 7096 SelectionDAG &DAG) const { 7097 if (!isa<ConstantSDNode>(Op.getOperand(1))) 7098 return SDValue(); 7099 7100 SDValue Vec = Op.getOperand(0); 7101 EVT VecVT = Vec.getValueType(); 7102 7103 // If this is a 256-bit vector result, first extract the 128-bit vector and 7104 // then extract the element from the 128-bit vector. 7105 if (VecVT.is256BitVector()) { 7106 DebugLoc dl = Op.getNode()->getDebugLoc(); 7107 unsigned NumElems = VecVT.getVectorNumElements(); 7108 SDValue Idx = Op.getOperand(1); 7109 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 7110 7111 // Get the 128-bit vector. 7112 Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); 7113 7114 if (IdxVal >= NumElems/2) 7115 IdxVal -= NumElems/2; 7116 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 7117 DAG.getConstant(IdxVal, MVT::i32)); 7118 } 7119 7120 assert(VecVT.is128BitVector() && "Unexpected vector length"); 7121 7122 if (Subtarget->hasSSE41()) { 7123 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 7124 if (Res.getNode()) 7125 return Res; 7126 } 7127 7128 EVT VT = Op.getValueType(); 7129 DebugLoc dl = Op.getDebugLoc(); 7130 // TODO: handle v16i8. 7131 if (VT.getSizeInBits() == 16) { 7132 SDValue Vec = Op.getOperand(0); 7133 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7134 if (Idx == 0) 7135 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 7136 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 7137 DAG.getNode(ISD::BITCAST, dl, 7138 MVT::v4i32, Vec), 7139 Op.getOperand(1))); 7140 // Transform it so it match pextrw which produces a 32-bit result. 7141 EVT EltVT = MVT::i32; 7142 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 7143 Op.getOperand(0), Op.getOperand(1)); 7144 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 7145 DAG.getValueType(VT)); 7146 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 7147 } 7148 7149 if (VT.getSizeInBits() == 32) { 7150 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7151 if (Idx == 0) 7152 return Op; 7153 7154 // SHUFPS the element to the lowest double word, then movss. 7155 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; 7156 EVT VVT = Op.getOperand(0).getValueType(); 7157 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 7158 DAG.getUNDEF(VVT), Mask); 7159 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 7160 DAG.getIntPtrConstant(0)); 7161 } 7162 7163 if (VT.getSizeInBits() == 64) { 7164 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 7165 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 7166 // to match extract_elt for f64. 
7167 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7168 if (Idx == 0) 7169 return Op; 7170 7171 // UNPCKHPD the element to the lowest double word, then movsd. 7172 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 7173 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 7174 int Mask[2] = { 1, -1 }; 7175 EVT VVT = Op.getOperand(0).getValueType(); 7176 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 7177 DAG.getUNDEF(VVT), Mask); 7178 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 7179 DAG.getIntPtrConstant(0)); 7180 } 7181 7182 return SDValue(); 7183} 7184 7185SDValue 7186X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 7187 SelectionDAG &DAG) const { 7188 EVT VT = Op.getValueType(); 7189 EVT EltVT = VT.getVectorElementType(); 7190 DebugLoc dl = Op.getDebugLoc(); 7191 7192 SDValue N0 = Op.getOperand(0); 7193 SDValue N1 = Op.getOperand(1); 7194 SDValue N2 = Op.getOperand(2); 7195 7196 if (!VT.is128BitVector()) 7197 return SDValue(); 7198 7199 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 7200 isa<ConstantSDNode>(N2)) { 7201 unsigned Opc; 7202 if (VT == MVT::v8i16) 7203 Opc = X86ISD::PINSRW; 7204 else if (VT == MVT::v16i8) 7205 Opc = X86ISD::PINSRB; 7206 else 7207 Opc = X86ISD::PINSRB; 7208 7209 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 7210 // argument. 7211 if (N1.getValueType() != MVT::i32) 7212 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 7213 if (N2.getValueType() != MVT::i32) 7214 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 7215 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 7216 } 7217 7218 if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 7219 // Bits [7:6] of the constant are the source select. This will always be 7220 // zero here. The DAG Combiner may combine an extract_elt index into these 7221 // bits. For example (insert (extract, 3), 2) could be matched by putting 7222 // the '3' into bits [7:6] of X86ISD::INSERTPS. 7223 // Bits [5:4] of the constant are the destination select. This is the 7224 // value of the incoming immediate. 7225 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 7226 // combine either bitwise AND or insert of float 0.0 to set these bits. 7227 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 7228 // Create this as a scalar to vector.. 7229 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 7230 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 7231 } 7232 7233 if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) { 7234 // PINSR* works with constant index. 7235 return Op; 7236 } 7237 return SDValue(); 7238} 7239 7240SDValue 7241X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 7242 EVT VT = Op.getValueType(); 7243 EVT EltVT = VT.getVectorElementType(); 7244 7245 DebugLoc dl = Op.getDebugLoc(); 7246 SDValue N0 = Op.getOperand(0); 7247 SDValue N1 = Op.getOperand(1); 7248 SDValue N2 = Op.getOperand(2); 7249 7250 // If this is a 256-bit vector result, first extract the 128-bit vector, 7251 // insert the element into the extracted half and then place it back. 7252 if (VT.is256BitVector()) { 7253 if (!isa<ConstantSDNode>(N2)) 7254 return SDValue(); 7255 7256 // Get the desired 128-bit vector half. 
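    // E.g. inserting element 5 of a v8i32: extract the upper half (elements
    // 4-7), insert the scalar at index 1 within that half, then write the
    // half back with Insert128BitVector.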
7257 unsigned NumElems = VT.getVectorNumElements(); 7258 unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); 7259 SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); 7260 7261 // Insert the element into the desired half. 7262 bool Upper = IdxVal >= NumElems/2; 7263 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, 7264 DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32)); 7265 7266 // Insert the changed part back to the 256-bit vector 7267 return Insert128BitVector(N0, V, IdxVal, DAG, dl); 7268 } 7269 7270 if (Subtarget->hasSSE41()) 7271 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 7272 7273 if (EltVT == MVT::i8) 7274 return SDValue(); 7275 7276 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 7277 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 7278 // as its second argument. 7279 if (N1.getValueType() != MVT::i32) 7280 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 7281 if (N2.getValueType() != MVT::i32) 7282 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 7283 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 7284 } 7285 return SDValue(); 7286} 7287 7288static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 7289 LLVMContext *Context = DAG.getContext(); 7290 DebugLoc dl = Op.getDebugLoc(); 7291 EVT OpVT = Op.getValueType(); 7292 7293 // If this is a 256-bit vector result, first insert into a 128-bit 7294 // vector and then insert into the 256-bit vector. 7295 if (!OpVT.is128BitVector()) { 7296 // Insert into a 128-bit vector. 7297 EVT VT128 = EVT::getVectorVT(*Context, 7298 OpVT.getVectorElementType(), 7299 OpVT.getVectorNumElements() / 2); 7300 7301 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 7302 7303 // Insert the 128-bit vector. 7304 return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); 7305 } 7306 7307 if (OpVT == MVT::v1i64 && 7308 Op.getOperand(0).getValueType() == MVT::i64) 7309 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 7310 7311 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 7312 assert(OpVT.is128BitVector() && "Expected an SSE type!"); 7313 return DAG.getNode(ISD::BITCAST, dl, OpVT, 7314 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 7315} 7316 7317// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 7318// a simple subregister reference or explicit instructions to grab 7319// upper bits of a vector. 7320static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, 7321 SelectionDAG &DAG) { 7322 if (Subtarget->hasFp256()) { 7323 DebugLoc dl = Op.getNode()->getDebugLoc(); 7324 SDValue Vec = Op.getNode()->getOperand(0); 7325 SDValue Idx = Op.getNode()->getOperand(1); 7326 7327 if (Op.getNode()->getValueType(0).is128BitVector() && 7328 Vec.getNode()->getValueType(0).is256BitVector() && 7329 isa<ConstantSDNode>(Idx)) { 7330 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 7331 return Extract128BitVector(Vec, IdxVal, DAG, dl); 7332 } 7333 } 7334 return SDValue(); 7335} 7336 7337// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 7338// simple superregister reference or explicit instructions to insert 7339// the upper bits of a vector. 
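// E.g. writing the upper 128-bit half of a 256-bit vector maps to a
// VINSERTF128 with immediate 1; an insert into the lower half of an
// otherwise undefined value can be a plain subregister copy.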
7340static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, 7341 SelectionDAG &DAG) { 7342 if (Subtarget->hasFp256()) { 7343 DebugLoc dl = Op.getNode()->getDebugLoc(); 7344 SDValue Vec = Op.getNode()->getOperand(0); 7345 SDValue SubVec = Op.getNode()->getOperand(1); 7346 SDValue Idx = Op.getNode()->getOperand(2); 7347 7348 if (Op.getNode()->getValueType(0).is256BitVector() && 7349 SubVec.getNode()->getValueType(0).is128BitVector() && 7350 isa<ConstantSDNode>(Idx)) { 7351 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 7352 return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); 7353 } 7354 } 7355 return SDValue(); 7356} 7357 7358// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 7359// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 7360// one of the above mentioned nodes. It has to be wrapped because otherwise 7361// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 7362// be used to form addressing mode. These wrapped nodes will be selected 7363// into MOV32ri. 7364SDValue 7365X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 7366 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 7367 7368 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7369 // global base reg. 7370 unsigned char OpFlag = 0; 7371 unsigned WrapperKind = X86ISD::Wrapper; 7372 CodeModel::Model M = getTargetMachine().getCodeModel(); 7373 7374 if (Subtarget->isPICStyleRIPRel() && 7375 (M == CodeModel::Small || M == CodeModel::Kernel)) 7376 WrapperKind = X86ISD::WrapperRIP; 7377 else if (Subtarget->isPICStyleGOT()) 7378 OpFlag = X86II::MO_GOTOFF; 7379 else if (Subtarget->isPICStyleStubPIC()) 7380 OpFlag = X86II::MO_PIC_BASE_OFFSET; 7381 7382 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 7383 CP->getAlignment(), 7384 CP->getOffset(), OpFlag); 7385 DebugLoc DL = CP->getDebugLoc(); 7386 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7387 // With PIC, the address is actually $g + Offset. 7388 if (OpFlag) { 7389 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7390 DAG.getNode(X86ISD::GlobalBaseReg, 7391 DebugLoc(), getPointerTy()), 7392 Result); 7393 } 7394 7395 return Result; 7396} 7397 7398SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 7399 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 7400 7401 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7402 // global base reg. 7403 unsigned char OpFlag = 0; 7404 unsigned WrapperKind = X86ISD::Wrapper; 7405 CodeModel::Model M = getTargetMachine().getCodeModel(); 7406 7407 if (Subtarget->isPICStyleRIPRel() && 7408 (M == CodeModel::Small || M == CodeModel::Kernel)) 7409 WrapperKind = X86ISD::WrapperRIP; 7410 else if (Subtarget->isPICStyleGOT()) 7411 OpFlag = X86II::MO_GOTOFF; 7412 else if (Subtarget->isPICStyleStubPIC()) 7413 OpFlag = X86II::MO_PIC_BASE_OFFSET; 7414 7415 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 7416 OpFlag); 7417 DebugLoc DL = JT->getDebugLoc(); 7418 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7419 7420 // With PIC, the address is actually $g + Offset. 
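  // Here $g is the value produced by X86ISD::GlobalBaseReg: the GOT base for
  // ELF @GOTOFF references (conventionally %ebx) or the picbase label on
  // Darwin, so the final address is that base plus the symbol's offset.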
7421 if (OpFlag) 7422 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7423 DAG.getNode(X86ISD::GlobalBaseReg, 7424 DebugLoc(), getPointerTy()), 7425 Result); 7426 7427 return Result; 7428} 7429 7430SDValue 7431X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 7432 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 7433 7434 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7435 // global base reg. 7436 unsigned char OpFlag = 0; 7437 unsigned WrapperKind = X86ISD::Wrapper; 7438 CodeModel::Model M = getTargetMachine().getCodeModel(); 7439 7440 if (Subtarget->isPICStyleRIPRel() && 7441 (M == CodeModel::Small || M == CodeModel::Kernel)) { 7442 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) 7443 OpFlag = X86II::MO_GOTPCREL; 7444 WrapperKind = X86ISD::WrapperRIP; 7445 } else if (Subtarget->isPICStyleGOT()) { 7446 OpFlag = X86II::MO_GOT; 7447 } else if (Subtarget->isPICStyleStubPIC()) { 7448 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; 7449 } else if (Subtarget->isPICStyleStubNoDynamic()) { 7450 OpFlag = X86II::MO_DARWIN_NONLAZY; 7451 } 7452 7453 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 7454 7455 DebugLoc DL = Op.getDebugLoc(); 7456 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7457 7458 // With PIC, the address is actually $g + Offset. 7459 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 7460 !Subtarget->is64Bit()) { 7461 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7462 DAG.getNode(X86ISD::GlobalBaseReg, 7463 DebugLoc(), getPointerTy()), 7464 Result); 7465 } 7466 7467 // For symbols that require a load from a stub to get the address, emit the 7468 // load. 7469 if (isGlobalStubReference(OpFlag)) 7470 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, 7471 MachinePointerInfo::getGOT(), false, false, false, 0); 7472 7473 return Result; 7474} 7475 7476SDValue 7477X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 7478 // Create the TargetBlockAddressAddress node. 7479 unsigned char OpFlags = 7480 Subtarget->ClassifyBlockAddressReference(); 7481 CodeModel::Model M = getTargetMachine().getCodeModel(); 7482 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 7483 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); 7484 DebugLoc dl = Op.getDebugLoc(); 7485 SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset, 7486 OpFlags); 7487 7488 if (Subtarget->isPICStyleRIPRel() && 7489 (M == CodeModel::Small || M == CodeModel::Kernel)) 7490 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7491 else 7492 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7493 7494 // With PIC, the address is actually $g + Offset. 7495 if (isGlobalRelativeToPICBase(OpFlags)) { 7496 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7497 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7498 Result); 7499 } 7500 7501 return Result; 7502} 7503 7504SDValue 7505X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 7506 int64_t Offset, 7507 SelectionDAG &DAG) const { 7508 // Create the TargetGlobalAddress node, folding in the constant 7509 // offset if it is legal. 
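  // E.g. a direct reference to @A + 40 under the small code model folds the
  // 40 into the TargetGlobalAddress (a single lea/mov); an offset outside the
  // code model's displacement range is left for the explicit ADD emitted
  // below.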
7510 unsigned char OpFlags = 7511 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 7512 CodeModel::Model M = getTargetMachine().getCodeModel(); 7513 SDValue Result; 7514 if (OpFlags == X86II::MO_NO_FLAG && 7515 X86::isOffsetSuitableForCodeModel(Offset, M)) { 7516 // A direct static reference to a global. 7517 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 7518 Offset = 0; 7519 } else { 7520 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 7521 } 7522 7523 if (Subtarget->isPICStyleRIPRel() && 7524 (M == CodeModel::Small || M == CodeModel::Kernel)) 7525 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7526 else 7527 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7528 7529 // With PIC, the address is actually $g + Offset. 7530 if (isGlobalRelativeToPICBase(OpFlags)) { 7531 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7532 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7533 Result); 7534 } 7535 7536 // For globals that require a load from a stub to get the address, emit the 7537 // load. 7538 if (isGlobalStubReference(OpFlags)) 7539 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 7540 MachinePointerInfo::getGOT(), false, false, false, 0); 7541 7542 // If there was a non-zero offset that we didn't fold, create an explicit 7543 // addition for it. 7544 if (Offset != 0) 7545 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 7546 DAG.getConstant(Offset, getPointerTy())); 7547 7548 return Result; 7549} 7550 7551SDValue 7552X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 7553 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 7554 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 7555 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 7556} 7557 7558static SDValue 7559GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 7560 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 7561 unsigned char OperandFlags, bool LocalDynamic = false) { 7562 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7563 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7564 DebugLoc dl = GA->getDebugLoc(); 7565 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7566 GA->getValueType(0), 7567 GA->getOffset(), 7568 OperandFlags); 7569 7570 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR 7571 : X86ISD::TLSADDR; 7572 7573 if (InFlag) { 7574 SDValue Ops[] = { Chain, TGA, *InFlag }; 7575 Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3); 7576 } else { 7577 SDValue Ops[] = { Chain, TGA }; 7578 Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2); 7579 } 7580 7581 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 7582 MFI->setAdjustsStack(true); 7583 7584 SDValue Flag = Chain.getValue(1); 7585 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 7586} 7587 7588// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 7589static SDValue 7590LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7591 const EVT PtrVT) { 7592 SDValue InFlag; 7593 DebugLoc dl = GA->getDebugLoc(); // ? 
function entry point might be better 7594 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 7595 DAG.getNode(X86ISD::GlobalBaseReg, 7596 DebugLoc(), PtrVT), InFlag); 7597 InFlag = Chain.getValue(1); 7598 7599 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 7600} 7601 7602// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 7603static SDValue 7604LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7605 const EVT PtrVT) { 7606 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 7607 X86::RAX, X86II::MO_TLSGD); 7608} 7609 7610static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, 7611 SelectionDAG &DAG, 7612 const EVT PtrVT, 7613 bool is64Bit) { 7614 DebugLoc dl = GA->getDebugLoc(); 7615 7616 // Get the start address of the TLS block for this module. 7617 X86MachineFunctionInfo* MFI = DAG.getMachineFunction() 7618 .getInfo<X86MachineFunctionInfo>(); 7619 MFI->incNumLocalDynamicTLSAccesses(); 7620 7621 SDValue Base; 7622 if (is64Bit) { 7623 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX, 7624 X86II::MO_TLSLD, /*LocalDynamic=*/true); 7625 } else { 7626 SDValue InFlag; 7627 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 7628 DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag); 7629 InFlag = Chain.getValue(1); 7630 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, 7631 X86II::MO_TLSLDM, /*LocalDynamic=*/true); 7632 } 7633 7634 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations 7635 // of Base. 7636 7637 // Build x@dtpoff. 7638 unsigned char OperandFlags = X86II::MO_DTPOFF; 7639 unsigned WrapperKind = X86ISD::Wrapper; 7640 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7641 GA->getValueType(0), 7642 GA->getOffset(), OperandFlags); 7643 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 7644 7645 // Add x@dtpoff with the base. 7646 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); 7647} 7648 7649// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. 7650static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7651 const EVT PtrVT, TLSModel::Model model, 7652 bool is64Bit, bool isPIC) { 7653 DebugLoc dl = GA->getDebugLoc(); 7654 7655 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 7656 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 7657 is64Bit ? 257 : 256)); 7658 7659 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 7660 DAG.getIntPtrConstant(0), 7661 MachinePointerInfo(Ptr), 7662 false, false, false, 0); 7663 7664 unsigned char OperandFlags = 0; 7665 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 7666 // initialexec. 7667 unsigned WrapperKind = X86ISD::Wrapper; 7668 if (model == TLSModel::LocalExec) { 7669 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 7670 } else if (model == TLSModel::InitialExec) { 7671 if (is64Bit) { 7672 OperandFlags = X86II::MO_GOTTPOFF; 7673 WrapperKind = X86ISD::WrapperRIP; 7674 } else { 7675 OperandFlags = isPIC ? 
X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; 7676 } 7677 } else { 7678 llvm_unreachable("Unexpected model"); 7679 } 7680 7681 // emit "addl x@ntpoff,%eax" (local exec) 7682 // or "addl x@indntpoff,%eax" (initial exec) 7683 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) 7684 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7685 GA->getValueType(0), 7686 GA->getOffset(), OperandFlags); 7687 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 7688 7689 if (model == TLSModel::InitialExec) { 7690 if (isPIC && !is64Bit) { 7691 Offset = DAG.getNode(ISD::ADD, dl, PtrVT, 7692 DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), 7693 Offset); 7694 } 7695 7696 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 7697 MachinePointerInfo::getGOT(), false, false, false, 7698 0); 7699 } 7700 7701 // The address of the thread local variable is the add of the thread 7702 // pointer with the offset of the variable. 7703 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 7704} 7705 7706SDValue 7707X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 7708 7709 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 7710 const GlobalValue *GV = GA->getGlobal(); 7711 7712 if (Subtarget->isTargetELF()) { 7713 TLSModel::Model model = getTargetMachine().getTLSModel(GV); 7714 7715 switch (model) { 7716 case TLSModel::GeneralDynamic: 7717 if (Subtarget->is64Bit()) 7718 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 7719 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 7720 case TLSModel::LocalDynamic: 7721 return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(), 7722 Subtarget->is64Bit()); 7723 case TLSModel::InitialExec: 7724 case TLSModel::LocalExec: 7725 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 7726 Subtarget->is64Bit(), 7727 getTargetMachine().getRelocationModel() == Reloc::PIC_); 7728 } 7729 llvm_unreachable("Unknown TLS model."); 7730 } 7731 7732 if (Subtarget->isTargetDarwin()) { 7733 // Darwin only has one model of TLS. Lower to that. 7734 unsigned char OpFlag = 0; 7735 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 7736 X86ISD::WrapperRIP : X86ISD::Wrapper; 7737 7738 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7739 // global base reg. 7740 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 7741 !Subtarget->is64Bit(); 7742 if (PIC32) 7743 OpFlag = X86II::MO_TLVP_PIC_BASE; 7744 else 7745 OpFlag = X86II::MO_TLVP; 7746 DebugLoc DL = Op.getDebugLoc(); 7747 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 7748 GA->getValueType(0), 7749 GA->getOffset(), OpFlag); 7750 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7751 7752 // With PIC32, the address is actually $g + Offset. 7753 if (PIC32) 7754 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7755 DAG.getNode(X86ISD::GlobalBaseReg, 7756 DebugLoc(), getPointerTy()), 7757 Offset); 7758 7759 // Lowering the machine isd will make sure everything is in the right 7760 // location. 7761 SDValue Chain = DAG.getEntryNode(); 7762 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7763 SDValue Args[] = { Chain, Offset }; 7764 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 7765 7766 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
7767 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7768 MFI->setAdjustsStack(true); 7769 7770 // And our return value (tls address) is in the standard call return value 7771 // location. 7772 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 7773 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), 7774 Chain.getValue(1)); 7775 } 7776 7777 if (Subtarget->isTargetWindows()) { 7778 // Just use the implicit TLS architecture 7779 // Need to generate someting similar to: 7780 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage 7781 // ; from TEB 7782 // mov ecx, dword [rel _tls_index]: Load index (from C runtime) 7783 // mov rcx, qword [rdx+rcx*8] 7784 // mov eax, .tls$:tlsvar 7785 // [rax+rcx] contains the address 7786 // Windows 64bit: gs:0x58 7787 // Windows 32bit: fs:__tls_array 7788 7789 // If GV is an alias then use the aliasee for determining 7790 // thread-localness. 7791 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 7792 GV = GA->resolveAliasedGlobal(false); 7793 DebugLoc dl = GA->getDebugLoc(); 7794 SDValue Chain = DAG.getEntryNode(); 7795 7796 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or 7797 // %gs:0x58 (64-bit). 7798 Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() 7799 ? Type::getInt8PtrTy(*DAG.getContext(), 7800 256) 7801 : Type::getInt32PtrTy(*DAG.getContext(), 7802 257)); 7803 7804 SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, 7805 Subtarget->is64Bit() 7806 ? DAG.getIntPtrConstant(0x58) 7807 : DAG.getExternalSymbol("_tls_array", 7808 getPointerTy()), 7809 MachinePointerInfo(Ptr), 7810 false, false, false, 0); 7811 7812 // Load the _tls_index variable 7813 SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); 7814 if (Subtarget->is64Bit()) 7815 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, 7816 IDX, MachinePointerInfo(), MVT::i32, 7817 false, false, 0); 7818 else 7819 IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), 7820 false, false, false, 0); 7821 7822 SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), 7823 getPointerTy()); 7824 IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); 7825 7826 SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); 7827 res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(), 7828 false, false, false, 0); 7829 7830 // Get the offset of start of .tls section 7831 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7832 GA->getValueType(0), 7833 GA->getOffset(), X86II::MO_SECREL); 7834 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA); 7835 7836 // The address of the thread local variable is the add of the thread 7837 // pointer with the offset of the variable. 7838 return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset); 7839 } 7840 7841 llvm_unreachable("TLS not implemented for this target."); 7842} 7843 7844/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values 7845/// and take a 2 x i32 value to shift plus a shift amount. 
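/// For example, a variable i64 shift left on i386 is built from the two i32
/// halves roughly as follows (illustrative only; the exact sequence is chosen
/// later by instruction selection):
///   shld %cl, %lo, %hi    # hi = (hi << cl) | (lo >> (32 - cl))
///   shl  %cl, %lo
///   test $32, %cl         # if the amount was >= 32, CMOV moves the shifted
///                         # lo into hi and zero into lo.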
7846SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ 7847 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 7848 EVT VT = Op.getValueType(); 7849 unsigned VTBits = VT.getSizeInBits(); 7850 DebugLoc dl = Op.getDebugLoc(); 7851 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 7852 SDValue ShOpLo = Op.getOperand(0); 7853 SDValue ShOpHi = Op.getOperand(1); 7854 SDValue ShAmt = Op.getOperand(2); 7855 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 7856 DAG.getConstant(VTBits - 1, MVT::i8)) 7857 : DAG.getConstant(0, VT); 7858 7859 SDValue Tmp2, Tmp3; 7860 if (Op.getOpcode() == ISD::SHL_PARTS) { 7861 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 7862 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 7863 } else { 7864 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 7865 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 7866 } 7867 7868 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 7869 DAG.getConstant(VTBits, MVT::i8)); 7870 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 7871 AndNode, DAG.getConstant(0, MVT::i8)); 7872 7873 SDValue Hi, Lo; 7874 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7875 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 7876 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 7877 7878 if (Op.getOpcode() == ISD::SHL_PARTS) { 7879 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7880 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7881 } else { 7882 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7883 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7884 } 7885 7886 SDValue Ops[2] = { Lo, Hi }; 7887 return DAG.getMergeValues(Ops, 2, dl); 7888} 7889 7890SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 7891 SelectionDAG &DAG) const { 7892 EVT SrcVT = Op.getOperand(0).getValueType(); 7893 7894 if (SrcVT.isVector()) 7895 return SDValue(); 7896 7897 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 7898 "Unknown SINT_TO_FP to lower!"); 7899 7900 // These are really Legal; return the operand so the caller accepts it as 7901 // Legal. 
7902 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 7903 return Op; 7904 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 7905 Subtarget->is64Bit()) { 7906 return Op; 7907 } 7908 7909 DebugLoc dl = Op.getDebugLoc(); 7910 unsigned Size = SrcVT.getSizeInBits()/8; 7911 MachineFunction &MF = DAG.getMachineFunction(); 7912 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 7913 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7914 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7915 StackSlot, 7916 MachinePointerInfo::getFixedStack(SSFI), 7917 false, false, 0); 7918 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 7919} 7920 7921SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 7922 SDValue StackSlot, 7923 SelectionDAG &DAG) const { 7924 // Build the FILD 7925 DebugLoc DL = Op.getDebugLoc(); 7926 SDVTList Tys; 7927 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 7928 if (useSSE) 7929 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 7930 else 7931 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 7932 7933 unsigned ByteSize = SrcVT.getSizeInBits()/8; 7934 7935 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 7936 MachineMemOperand *MMO; 7937 if (FI) { 7938 int SSFI = FI->getIndex(); 7939 MMO = 7940 DAG.getMachineFunction() 7941 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7942 MachineMemOperand::MOLoad, ByteSize, ByteSize); 7943 } else { 7944 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 7945 StackSlot = StackSlot.getOperand(1); 7946 } 7947 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 7948 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 7949 X86ISD::FILD, DL, 7950 Tys, Ops, array_lengthof(Ops), 7951 SrcVT, MMO); 7952 7953 if (useSSE) { 7954 Chain = Result.getValue(1); 7955 SDValue InFlag = Result.getValue(2); 7956 7957 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 7958 // shouldn't be necessary except that RFP cannot be live across 7959 // multiple blocks. When stackifier is fixed, they can be uncoupled. 7960 MachineFunction &MF = DAG.getMachineFunction(); 7961 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 7962 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 7963 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7964 Tys = DAG.getVTList(MVT::Other); 7965 SDValue Ops[] = { 7966 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 7967 }; 7968 MachineMemOperand *MMO = 7969 DAG.getMachineFunction() 7970 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7971 MachineMemOperand::MOStore, SSFISize, SSFISize); 7972 7973 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 7974 Ops, array_lengthof(Ops), 7975 Op.getValueType(), MMO); 7976 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 7977 MachinePointerInfo::getFixedStack(SSFI), 7978 false, false, false, 0); 7979 } 7980 7981 return Result; 7982} 7983 7984// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 7985SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 7986 SelectionDAG &DAG) const { 7987 // This algorithm is not obvious. 
Here it is what we're trying to output: 7988 /* 7989 movq %rax, %xmm0 7990 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } 7991 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } 7992 #ifdef __SSE3__ 7993 haddpd %xmm0, %xmm0 7994 #else 7995 pshufd $0x4e, %xmm0, %xmm1 7996 addpd %xmm1, %xmm0 7997 #endif 7998 */ 7999 8000 DebugLoc dl = Op.getDebugLoc(); 8001 LLVMContext *Context = DAG.getContext(); 8002 8003 // Build some magic constants. 8004 const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; 8005 Constant *C0 = ConstantDataVector::get(*Context, CV0); 8006 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 8007 8008 SmallVector<Constant*,2> CV1; 8009 CV1.push_back( 8010 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 8011 CV1.push_back( 8012 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 8013 Constant *C1 = ConstantVector::get(CV1); 8014 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 8015 8016 // Load the 64-bit value into an XMM register. 8017 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 8018 Op.getOperand(0)); 8019 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 8020 MachinePointerInfo::getConstantPool(), 8021 false, false, false, 16); 8022 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, 8023 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), 8024 CLod0); 8025 8026 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 8027 MachinePointerInfo::getConstantPool(), 8028 false, false, false, 16); 8029 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); 8030 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 8031 SDValue Result; 8032 8033 if (Subtarget->hasSSE3()) { 8034 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. 8035 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); 8036 } else { 8037 SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); 8038 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, 8039 S2F, 0x4E, DAG); 8040 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, 8041 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), 8042 Sub); 8043 } 8044 8045 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, 8046 DAG.getIntPtrConstant(0)); 8047} 8048 8049// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 8050SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 8051 SelectionDAG &DAG) const { 8052 DebugLoc dl = Op.getDebugLoc(); 8053 // FP constant to bias correct the final result. 8054 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 8055 MVT::f64); 8056 8057 // Load the 32-bit value into an XMM register. 8058 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 8059 Op.getOperand(0)); 8060 8061 // Zero out the upper parts of the register. 8062 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); 8063 8064 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 8065 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 8066 DAG.getIntPtrConstant(0)); 8067 8068 // Or the load with the bias. 
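// The bias is the double 2^52 (0x4330000000000000). OR-ing the zero-extended
// 32-bit value into its low word produces the double 2^52 + x exactly, so the
// FSUB below recovers x without rounding.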
8069 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 8070 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 8071 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 8072 MVT::v2f64, Load)), 8073 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 8074 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 8075 MVT::v2f64, Bias))); 8076 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 8077 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 8078 DAG.getIntPtrConstant(0)); 8079 8080 // Subtract the bias. 8081 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 8082 8083 // Handle final rounding. 8084 EVT DestVT = Op.getValueType(); 8085 8086 if (DestVT.bitsLT(MVT::f64)) 8087 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 8088 DAG.getIntPtrConstant(0)); 8089 if (DestVT.bitsGT(MVT::f64)) 8090 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 8091 8092 // Handle final rounding. 8093 return Sub; 8094} 8095 8096SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, 8097 SelectionDAG &DAG) const { 8098 SDValue N0 = Op.getOperand(0); 8099 EVT SVT = N0.getValueType(); 8100 DebugLoc dl = Op.getDebugLoc(); 8101 8102 assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || 8103 SVT == MVT::v8i8 || SVT == MVT::v8i16) && 8104 "Custom UINT_TO_FP is not supported!"); 8105 8106 EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, SVT.getVectorNumElements()); 8107 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), 8108 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); 8109} 8110 8111SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 8112 SelectionDAG &DAG) const { 8113 SDValue N0 = Op.getOperand(0); 8114 DebugLoc dl = Op.getDebugLoc(); 8115 8116 if (Op.getValueType().isVector()) 8117 return lowerUINT_TO_FP_vec(Op, DAG); 8118 8119 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 8120 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 8121 // the optimization here. 8122 if (DAG.SignBitIsZero(N0)) 8123 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 8124 8125 EVT SrcVT = N0.getValueType(); 8126 EVT DstVT = Op.getValueType(); 8127 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 8128 return LowerUINT_TO_FP_i64(Op, DAG); 8129 if (SrcVT == MVT::i32 && X86ScalarSSEf64) 8130 return LowerUINT_TO_FP_i32(Op, DAG); 8131 if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) 8132 return SDValue(); 8133 8134 // Make a 64-bit buffer, and use it to build an FILD. 8135 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 8136 if (SrcVT == MVT::i32) { 8137 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 8138 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 8139 getPointerTy(), StackSlot, WordOff); 8140 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 8141 StackSlot, MachinePointerInfo(), 8142 false, false, 0); 8143 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 8144 OffsetSlot, MachinePointerInfo(), 8145 false, false, 0); 8146 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 8147 return Fild; 8148 } 8149 8150 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 8151 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 8152 StackSlot, MachinePointerInfo(), 8153 false, false, 0); 8154 // For i64 source, we need to add the appropriate power of 2 if the input 8155 // was negative. 
This is the same as the optimization in
8156 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
8157 // we must be careful to do the computation in x87 extended precision, not
8158 // in SSE. (The generic code can't know it's OK to do this, or how to.)
8159 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
8160 MachineMemOperand *MMO =
8161 DAG.getMachineFunction()
8162 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
8163 MachineMemOperand::MOLoad, 8, 8);
8164
8165 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
8166 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
8167 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
8168 MVT::i64, MMO);
8169
8170 APInt FF(32, 0x5F800000ULL);
8171
8172 // Check whether the sign bit is set.
8173 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
8174 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
8175 ISD::SETLT);
8176
8177 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
8178 SDValue FudgePtr = DAG.getConstantPool(
8179 ConstantInt::get(*DAG.getContext(), FF.zext(64)),
8180 getPointerTy());
8181
8182 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
8183 SDValue Zero = DAG.getIntPtrConstant(0);
8184 SDValue Four = DAG.getIntPtrConstant(4);
8185 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
8186 Zero, Four);
8187 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
8188
8189 // Load the value out, extending it from f32 to f80.
8190 // FIXME: Avoid the extend by constructing the right constant pool?
8191 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
8192 FudgePtr, MachinePointerInfo::getConstantPool(),
8193 MVT::f32, false, false, 4);
8194 // Extend everything to 80 bits to force it to be done on x87.
8195 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
8196 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
8197}
8198
8199std::pair<SDValue,SDValue> X86TargetLowering::
8200FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const {
8201 DebugLoc DL = Op.getDebugLoc();
8202
8203 EVT DstTy = Op.getValueType();
8204
8205 if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
8206 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
8207 DstTy = MVT::i64;
8208 }
8209
8210 assert(DstTy.getSimpleVT() <= MVT::i64 &&
8211 DstTy.getSimpleVT() >= MVT::i16 &&
8212 "Unknown FP_TO_INT to lower!");
8213
8214 // These are really Legal.
8215 if (DstTy == MVT::i32 &&
8216 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8217 return std::make_pair(SDValue(), SDValue());
8218 if (Subtarget->is64Bit() &&
8219 DstTy == MVT::i64 &&
8220 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
8221 return std::make_pair(SDValue(), SDValue());
8222
8223 // We lower FP->int64 either into FISTP64 followed by a load from a temporary
8224 // stack slot, or into the FTOL runtime function.
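// The FTOL path (FP_TO_UINT on 32-bit Windows) calls a runtime helper that
// returns its 64-bit result in EDX:EAX; those registers are reassembled into
// a pair below.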
8225 MachineFunction &MF = DAG.getMachineFunction(); 8226 unsigned MemSize = DstTy.getSizeInBits()/8; 8227 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 8228 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8229 8230 unsigned Opc; 8231 if (!IsSigned && isIntegerTypeFTOL(DstTy)) 8232 Opc = X86ISD::WIN_FTOL; 8233 else 8234 switch (DstTy.getSimpleVT().SimpleTy) { 8235 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 8236 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 8237 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 8238 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 8239 } 8240 8241 SDValue Chain = DAG.getEntryNode(); 8242 SDValue Value = Op.getOperand(0); 8243 EVT TheVT = Op.getOperand(0).getValueType(); 8244 // FIXME This causes a redundant load/store if the SSE-class value is already 8245 // in memory, such as if it is on the callstack. 8246 if (isScalarFPTypeInSSEReg(TheVT)) { 8247 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 8248 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 8249 MachinePointerInfo::getFixedStack(SSFI), 8250 false, false, 0); 8251 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 8252 SDValue Ops[] = { 8253 Chain, StackSlot, DAG.getValueType(TheVT) 8254 }; 8255 8256 MachineMemOperand *MMO = 8257 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8258 MachineMemOperand::MOLoad, MemSize, MemSize); 8259 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 8260 DstTy, MMO); 8261 Chain = Value.getValue(1); 8262 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 8263 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8264 } 8265 8266 MachineMemOperand *MMO = 8267 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8268 MachineMemOperand::MOStore, MemSize, MemSize); 8269 8270 if (Opc != X86ISD::WIN_FTOL) { 8271 // Build the FP_TO_INT*_IN_MEM 8272 SDValue Ops[] = { Chain, Value, StackSlot }; 8273 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 8274 Ops, 3, DstTy, MMO); 8275 return std::make_pair(FIST, StackSlot); 8276 } else { 8277 SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, 8278 DAG.getVTList(MVT::Other, MVT::Glue), 8279 Chain, Value); 8280 SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, 8281 MVT::i32, ftol.getValue(1)); 8282 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, 8283 MVT::i32, eax.getValue(2)); 8284 SDValue Ops[] = { eax, edx }; 8285 SDValue pair = IsReplace 8286 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2) 8287 : DAG.getMergeValues(Ops, 2, DL); 8288 return std::make_pair(pair, SDValue()); 8289 } 8290} 8291 8292SDValue X86TargetLowering::lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const { 8293 DebugLoc DL = Op.getDebugLoc(); 8294 EVT VT = Op.getValueType(); 8295 SDValue In = Op.getOperand(0); 8296 EVT SVT = In.getValueType(); 8297 8298 if (!VT.is256BitVector() || !SVT.is128BitVector() || 8299 VT.getVectorNumElements() != SVT.getVectorNumElements()) 8300 return SDValue(); 8301 8302 assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!"); 8303 8304 // AVX2 has better support of integer extending. 
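// e.g. a v8i16 -> v8i32 zero extension can then be a single VPMOVZXWD of the
// whole 256-bit destination, instead of the two 128-bit halves built below.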
8305 if (Subtarget->hasInt256())
8306 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
8307
8308 SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
8309 static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
8310 SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
8311 DAG.getVectorShuffle(MVT::v8i16, DL, In, DAG.getUNDEF(MVT::v8i16), &Mask[0]));
8312
8313 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
8314}
8315
8316SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
8317 DebugLoc DL = Op.getDebugLoc();
8318 EVT VT = Op.getValueType();
8319 SDValue In = Op.getOperand(0);
8320 EVT SVT = In.getValueType();
8321
8322 if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) {
8323 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
8324 if (Subtarget->hasInt256()) {
8325 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
8326 In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
8327 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
8328 ShufMask);
8329 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
8330 DAG.getIntPtrConstant(0));
8331 }
8332
8333 // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
8334 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
8335 DAG.getIntPtrConstant(0));
8336 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
8337 DAG.getIntPtrConstant(2));
8338
8339 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
8340 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
8341
8342 // The PSHUFD mask:
8343 static const int ShufMask1[] = {0, 2, 0, 0};
8344 SDValue Undef = DAG.getUNDEF(VT);
8345 OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
8346 OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);
8347
8348 // The MOVLHPS mask:
8349 static const int ShufMask2[] = {0, 1, 4, 5};
8350 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
8351 }
8352
8353 if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) {
8354 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
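// The 32-byte PSHUFB mask below keeps bytes {0,1,4,5,8,9,12,13} of each
// 128-bit lane (the low halves of the dwords) and zeroes the rest; the
// following v4i64 shuffle then concatenates the two lane-local results.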
8355 if (Subtarget->hasInt256()) { 8356 In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In); 8357 8358 SmallVector<SDValue,32> pshufbMask; 8359 for (unsigned i = 0; i < 2; ++i) { 8360 pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); 8361 pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); 8362 pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); 8363 pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); 8364 pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); 8365 pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); 8366 pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); 8367 pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); 8368 for (unsigned j = 0; j < 8; ++j) 8369 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 8370 } 8371 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, 8372 &pshufbMask[0], 32); 8373 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); 8374 In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); 8375 8376 static const int ShufMask[] = {0, 2, -1, -1}; 8377 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), 8378 &ShufMask[0]); 8379 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 8380 DAG.getIntPtrConstant(0)); 8381 return DAG.getNode(ISD::BITCAST, DL, VT, In); 8382 } 8383 8384 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 8385 DAG.getIntPtrConstant(0)); 8386 8387 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 8388 DAG.getIntPtrConstant(4)); 8389 8390 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo); 8391 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi); 8392 8393 // The PSHUFB mask: 8394 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, 8395 -1, -1, -1, -1, -1, -1, -1, -1}; 8396 8397 SDValue Undef = DAG.getUNDEF(MVT::v16i8); 8398 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); 8399 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); 8400 8401 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); 8402 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); 8403 8404 // The MOVLHPS Mask: 8405 static const int ShufMask2[] = {0, 1, 4, 5}; 8406 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); 8407 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res); 8408 } 8409 8410 // Handle truncation of V256 to V128 using shuffles. 
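// e.g. v16i16 -> v16i8: bitcast the source to v32i8, shuffle the even
// elements to the front, and return the low 128-bit half.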
8411 if (!VT.is128BitVector() || !SVT.is256BitVector()) 8412 return SDValue(); 8413 8414 assert(VT.getVectorNumElements() != SVT.getVectorNumElements() && 8415 "Invalid op"); 8416 assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); 8417 8418 unsigned NumElems = VT.getVectorNumElements(); 8419 EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 8420 NumElems * 2); 8421 8422 SmallVector<int, 16> MaskVec(NumElems * 2, -1); 8423 // Prepare truncation shuffle mask 8424 for (unsigned i = 0; i != NumElems; ++i) 8425 MaskVec[i] = i * 2; 8426 SDValue V = DAG.getVectorShuffle(NVT, DL, 8427 DAG.getNode(ISD::BITCAST, DL, NVT, In), 8428 DAG.getUNDEF(NVT), &MaskVec[0]); 8429 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, 8430 DAG.getIntPtrConstant(0)); 8431} 8432 8433SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 8434 SelectionDAG &DAG) const { 8435 if (Op.getValueType().isVector()) { 8436 if (Op.getValueType() == MVT::v8i16) 8437 return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(), 8438 DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(), 8439 MVT::v8i32, Op.getOperand(0))); 8440 return SDValue(); 8441 } 8442 8443 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 8444 /*IsSigned=*/ true, /*IsReplace=*/ false); 8445 SDValue FIST = Vals.first, StackSlot = Vals.second; 8446 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 8447 if (FIST.getNode() == 0) return Op; 8448 8449 if (StackSlot.getNode()) 8450 // Load the result. 8451 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 8452 FIST, StackSlot, MachinePointerInfo(), 8453 false, false, false, 0); 8454 8455 // The node is the result. 8456 return FIST; 8457} 8458 8459SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 8460 SelectionDAG &DAG) const { 8461 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 8462 /*IsSigned=*/ false, /*IsReplace=*/ false); 8463 SDValue FIST = Vals.first, StackSlot = Vals.second; 8464 assert(FIST.getNode() && "Unexpected failure"); 8465 8466 if (StackSlot.getNode()) 8467 // Load the result. 8468 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 8469 FIST, StackSlot, MachinePointerInfo(), 8470 false, false, false, 0); 8471 8472 // The node is the result. 8473 return FIST; 8474} 8475 8476SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op, 8477 SelectionDAG &DAG) const { 8478 DebugLoc DL = Op.getDebugLoc(); 8479 EVT VT = Op.getValueType(); 8480 SDValue In = Op.getOperand(0); 8481 EVT SVT = In.getValueType(); 8482 8483 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); 8484 8485 return DAG.getNode(X86ISD::VFPEXT, DL, VT, 8486 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, 8487 In, DAG.getUNDEF(SVT))); 8488} 8489 8490SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { 8491 LLVMContext *Context = DAG.getContext(); 8492 DebugLoc dl = Op.getDebugLoc(); 8493 EVT VT = Op.getValueType(); 8494 EVT EltVT = VT; 8495 unsigned NumElts = VT == MVT::f64 ? 
2 : 4; 8496 if (VT.isVector()) { 8497 EltVT = VT.getVectorElementType(); 8498 NumElts = VT.getVectorNumElements(); 8499 } 8500 Constant *C; 8501 if (EltVT == MVT::f64) 8502 C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 8503 else 8504 C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 8505 C = ConstantVector::getSplat(NumElts, C); 8506 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); 8507 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); 8508 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8509 MachinePointerInfo::getConstantPool(), 8510 false, false, false, Alignment); 8511 if (VT.isVector()) { 8512 MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; 8513 return DAG.getNode(ISD::BITCAST, dl, VT, 8514 DAG.getNode(ISD::AND, dl, ANDVT, 8515 DAG.getNode(ISD::BITCAST, dl, ANDVT, 8516 Op.getOperand(0)), 8517 DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask))); 8518 } 8519 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 8520} 8521 8522SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 8523 LLVMContext *Context = DAG.getContext(); 8524 DebugLoc dl = Op.getDebugLoc(); 8525 EVT VT = Op.getValueType(); 8526 EVT EltVT = VT; 8527 unsigned NumElts = VT == MVT::f64 ? 2 : 4; 8528 if (VT.isVector()) { 8529 EltVT = VT.getVectorElementType(); 8530 NumElts = VT.getVectorNumElements(); 8531 } 8532 Constant *C; 8533 if (EltVT == MVT::f64) 8534 C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 8535 else 8536 C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 8537 C = ConstantVector::getSplat(NumElts, C); 8538 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); 8539 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); 8540 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8541 MachinePointerInfo::getConstantPool(), 8542 false, false, false, Alignment); 8543 if (VT.isVector()) { 8544 MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; 8545 return DAG.getNode(ISD::BITCAST, dl, VT, 8546 DAG.getNode(ISD::XOR, dl, XORVT, 8547 DAG.getNode(ISD::BITCAST, dl, XORVT, 8548 Op.getOperand(0)), 8549 DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); 8550 } 8551 8552 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 8553} 8554 8555SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 8556 LLVMContext *Context = DAG.getContext(); 8557 SDValue Op0 = Op.getOperand(0); 8558 SDValue Op1 = Op.getOperand(1); 8559 DebugLoc dl = Op.getDebugLoc(); 8560 EVT VT = Op.getValueType(); 8561 EVT SrcVT = Op1.getValueType(); 8562 8563 // If second operand is smaller, extend it first. 8564 if (SrcVT.bitsLT(VT)) { 8565 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 8566 SrcVT = VT; 8567 } 8568 // And if it is bigger, shrink it first. 8569 if (SrcVT.bitsGT(VT)) { 8570 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 8571 SrcVT = VT; 8572 } 8573 8574 // At this point the operands and the result should have the same 8575 // type, and that won't be f80 since that is not custom lowered. 8576 8577 // First get the sign bit of second operand. 
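// Overall, copysign(x, y) is computed as (x & ~sign-mask) | (y & sign-mask),
// with both masks loaded from the constant pool.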
8578 SmallVector<Constant*,4> CV; 8579 if (SrcVT == MVT::f64) { 8580 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 8581 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 8582 } else { 8583 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 8584 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8585 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8586 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8587 } 8588 Constant *C = ConstantVector::get(CV); 8589 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8590 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 8591 MachinePointerInfo::getConstantPool(), 8592 false, false, false, 16); 8593 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 8594 8595 // Shift sign bit right or left if the two operands have different types. 8596 if (SrcVT.bitsGT(VT)) { 8597 // Op0 is MVT::f32, Op1 is MVT::f64. 8598 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 8599 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 8600 DAG.getConstant(32, MVT::i32)); 8601 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 8602 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 8603 DAG.getIntPtrConstant(0)); 8604 } 8605 8606 // Clear first operand sign bit. 8607 CV.clear(); 8608 if (VT == MVT::f64) { 8609 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 8610 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 8611 } else { 8612 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 8613 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8614 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8615 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8616 } 8617 C = ConstantVector::get(CV); 8618 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8619 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8620 MachinePointerInfo::getConstantPool(), 8621 false, false, false, 16); 8622 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 8623 8624 // Or the value with the sign bit. 8625 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 8626} 8627 8628static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { 8629 SDValue N0 = Op.getOperand(0); 8630 DebugLoc dl = Op.getDebugLoc(); 8631 EVT VT = Op.getValueType(); 8632 8633 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 8634 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 8635 DAG.getConstant(1, VT)); 8636 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 8637} 8638 8639// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. 8640// 8641SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const { 8642 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); 8643 8644 if (!Subtarget->hasSSE41()) 8645 return SDValue(); 8646 8647 if (!Op->hasOneUse()) 8648 return SDValue(); 8649 8650 SDNode *N = Op.getNode(); 8651 DebugLoc DL = N->getDebugLoc(); 8652 8653 SmallVector<SDValue, 8> Opnds; 8654 DenseMap<SDValue, unsigned> VecInMap; 8655 EVT VT = MVT::Other; 8656 8657 // Recognize a special case where a vector is casted into wide integer to 8658 // test all 0s. 
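// Such a test typically comes from something like
//   (icmp eq (bitcast <2 x i64> %v to i128), 0)
// which legalization expands into an OR tree of EXTRACT_VECTOR_ELTs; if every
// element of each source vector feeds the tree, the whole thing can be
// selected as a single PTEST.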
8659 Opnds.push_back(N->getOperand(0)); 8660 Opnds.push_back(N->getOperand(1)); 8661 8662 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { 8663 SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot; 8664 // BFS traverse all OR'd operands. 8665 if (I->getOpcode() == ISD::OR) { 8666 Opnds.push_back(I->getOperand(0)); 8667 Opnds.push_back(I->getOperand(1)); 8668 // Re-evaluate the number of nodes to be traversed. 8669 e += 2; // 2 more nodes (LHS and RHS) are pushed. 8670 continue; 8671 } 8672 8673 // Quit if a non-EXTRACT_VECTOR_ELT 8674 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 8675 return SDValue(); 8676 8677 // Quit if without a constant index. 8678 SDValue Idx = I->getOperand(1); 8679 if (!isa<ConstantSDNode>(Idx)) 8680 return SDValue(); 8681 8682 SDValue ExtractedFromVec = I->getOperand(0); 8683 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); 8684 if (M == VecInMap.end()) { 8685 VT = ExtractedFromVec.getValueType(); 8686 // Quit if not 128/256-bit vector. 8687 if (!VT.is128BitVector() && !VT.is256BitVector()) 8688 return SDValue(); 8689 // Quit if not the same type. 8690 if (VecInMap.begin() != VecInMap.end() && 8691 VT != VecInMap.begin()->first.getValueType()) 8692 return SDValue(); 8693 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; 8694 } 8695 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); 8696 } 8697 8698 assert((VT.is128BitVector() || VT.is256BitVector()) && 8699 "Not extracted from 128-/256-bit vector."); 8700 8701 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; 8702 SmallVector<SDValue, 8> VecIns; 8703 8704 for (DenseMap<SDValue, unsigned>::const_iterator 8705 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { 8706 // Quit if not all elements are used. 8707 if (I->second != FullMask) 8708 return SDValue(); 8709 VecIns.push_back(I->first); 8710 } 8711 8712 EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; 8713 8714 // Cast all vectors into TestVT for PTEST. 8715 for (unsigned i = 0, e = VecIns.size(); i < e; ++i) 8716 VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]); 8717 8718 // If more than one full vectors are evaluated, OR them first before PTEST. 8719 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { 8720 // Each iteration will OR 2 nodes and append the result until there is only 8721 // 1 node left, i.e. the final OR'd value of all vectors. 8722 SDValue LHS = VecIns[Slot]; 8723 SDValue RHS = VecIns[Slot + 1]; 8724 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); 8725 } 8726 8727 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, 8728 VecIns.back(), VecIns.back()); 8729} 8730 8731/// Emit nodes that will be selected as "test Op0,Op0", or something 8732/// equivalent. 8733SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 8734 SelectionDAG &DAG) const { 8735 DebugLoc dl = Op.getDebugLoc(); 8736 8737 // CF and OF aren't always set the way we want. Determine which 8738 // of these we need. 8739 bool NeedCF = false; 8740 bool NeedOF = false; 8741 switch (X86CC) { 8742 default: break; 8743 case X86::COND_A: case X86::COND_AE: 8744 case X86::COND_B: case X86::COND_BE: 8745 NeedCF = true; 8746 break; 8747 case X86::COND_G: case X86::COND_GE: 8748 case X86::COND_L: case X86::COND_LE: 8749 case X86::COND_O: case X86::COND_NO: 8750 NeedOF = true; 8751 break; 8752 } 8753 8754 // See if we can use the EFLAGS value from the operand instead of 8755 // doing a separate TEST. 
TEST always sets OF and CF to 0, so unless
8756 // we prove that the arithmetic won't overflow, we can't use OF or CF.
8757 if (Op.getResNo() != 0 || NeedOF || NeedCF)
8758 // Emit a CMP with 0, which is the TEST pattern.
8759 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8760 DAG.getConstant(0, Op.getValueType()));
8761
8762 unsigned Opcode = 0;
8763 unsigned NumOperands = 0;
8764
8765 // Truncate operations may prevent the merge of the SETCC instruction
8766 // and the arithmetic instruction before it. Attempt to truncate the operands
8767 // of the arithmetic instruction and use a reduced bit-width instruction.
8768 bool NeedTruncation = false;
8769 SDValue ArithOp = Op;
8770 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
8771 SDValue Arith = Op->getOperand(0);
8772 // Both the trunc and the arithmetic op need to have one user each.
8773 if (Arith->hasOneUse())
8774 switch (Arith.getOpcode()) {
8775 default: break;
8776 case ISD::ADD:
8777 case ISD::SUB:
8778 case ISD::AND:
8779 case ISD::OR:
8780 case ISD::XOR: {
8781 NeedTruncation = true;
8782 ArithOp = Arith;
8783 }
8784 }
8785 }
8786
8787 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
8788 // which may be the result of a CAST. We use the variable 'Op', which is the
8789 // non-casted variable when we check for possible users.
8790 switch (ArithOp.getOpcode()) {
8791 case ISD::ADD:
8792 // Due to an isel shortcoming, be conservative if this add is likely to be
8793 // selected as part of a load-modify-store instruction. When the root node
8794 // in a match is a store, isel doesn't know how to remap non-chain non-flag
8795 // uses of other nodes in the match, such as the ADD in this case. This
8796 // leads to the ADD being left around and reselected, with the result being
8797 // two adds in the output. Alas, even if none of our users are stores, that
8798 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
8799 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
8800 // climbing the DAG back to the root, and it doesn't seem to be worth the
8801 // effort.
8802 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8803 UE = Op.getNode()->use_end(); UI != UE; ++UI)
8804 if (UI->getOpcode() != ISD::CopyToReg &&
8805 UI->getOpcode() != ISD::SETCC &&
8806 UI->getOpcode() != ISD::STORE)
8807 goto default_case;
8808
8809 if (ConstantSDNode *C =
8810 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
8811 // An add of one will be selected as an INC.
8812 if (C->getAPIntValue() == 1) {
8813 Opcode = X86ISD::INC;
8814 NumOperands = 1;
8815 break;
8816 }
8817
8818 // An add of negative one (subtract of one) will be selected as a DEC.
8819 if (C->getAPIntValue().isAllOnesValue()) {
8820 Opcode = X86ISD::DEC;
8821 NumOperands = 1;
8822 break;
8823 }
8824 }
8825
8826 // Otherwise use a regular EFLAGS-setting add.
8827 Opcode = X86ISD::ADD;
8828 NumOperands = 2;
8829 break;
8830 case ISD::AND: {
8831 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
8832 // because a TEST instruction will be better.
8833 bool NonFlagUse = false;
8834 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8835 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
8836 SDNode *User = *UI;
8837 unsigned UOpNo = UI.getOperandNo();
8838 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
8839 // Look past the truncate.
8840 UOpNo = User->use_begin().getOperandNo(); 8841 User = *User->use_begin(); 8842 } 8843 8844 if (User->getOpcode() != ISD::BRCOND && 8845 User->getOpcode() != ISD::SETCC && 8846 !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) { 8847 NonFlagUse = true; 8848 break; 8849 } 8850 } 8851 8852 if (!NonFlagUse) 8853 break; 8854 } 8855 // FALL THROUGH 8856 case ISD::SUB: 8857 case ISD::OR: 8858 case ISD::XOR: 8859 // Due to the ISEL shortcoming noted above, be conservative if this op is 8860 // likely to be selected as part of a load-modify-store instruction. 8861 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 8862 UE = Op.getNode()->use_end(); UI != UE; ++UI) 8863 if (UI->getOpcode() == ISD::STORE) 8864 goto default_case; 8865 8866 // Otherwise use a regular EFLAGS-setting instruction. 8867 switch (ArithOp.getOpcode()) { 8868 default: llvm_unreachable("unexpected operator!"); 8869 case ISD::SUB: Opcode = X86ISD::SUB; break; 8870 case ISD::XOR: Opcode = X86ISD::XOR; break; 8871 case ISD::AND: Opcode = X86ISD::AND; break; 8872 case ISD::OR: { 8873 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { 8874 SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG); 8875 if (EFLAGS.getNode()) 8876 return EFLAGS; 8877 } 8878 Opcode = X86ISD::OR; 8879 break; 8880 } 8881 } 8882 8883 NumOperands = 2; 8884 break; 8885 case X86ISD::ADD: 8886 case X86ISD::SUB: 8887 case X86ISD::INC: 8888 case X86ISD::DEC: 8889 case X86ISD::OR: 8890 case X86ISD::XOR: 8891 case X86ISD::AND: 8892 return SDValue(Op.getNode(), 1); 8893 default: 8894 default_case: 8895 break; 8896 } 8897 8898 // If we found that truncation is beneficial, perform the truncation and 8899 // update 'Op'. 8900 if (NeedTruncation) { 8901 EVT VT = Op.getValueType(); 8902 SDValue WideVal = Op->getOperand(0); 8903 EVT WideVT = WideVal.getValueType(); 8904 unsigned ConvertedOp = 0; 8905 // Use a target machine opcode to prevent further DAGCombine 8906 // optimizations that may separate the arithmetic operations 8907 // from the setcc node. 8908 switch (WideVal.getOpcode()) { 8909 default: break; 8910 case ISD::ADD: ConvertedOp = X86ISD::ADD; break; 8911 case ISD::SUB: ConvertedOp = X86ISD::SUB; break; 8912 case ISD::AND: ConvertedOp = X86ISD::AND; break; 8913 case ISD::OR: ConvertedOp = X86ISD::OR; break; 8914 case ISD::XOR: ConvertedOp = X86ISD::XOR; break; 8915 } 8916 8917 if (ConvertedOp) { 8918 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8919 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { 8920 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); 8921 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); 8922 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); 8923 } 8924 } 8925 } 8926 8927 if (Opcode == 0) 8928 // Emit a CMP with 0, which is the TEST pattern. 8929 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 8930 DAG.getConstant(0, Op.getValueType())); 8931 8932 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 8933 SmallVector<SDValue, 4> Ops; 8934 for (unsigned i = 0; i != NumOperands; ++i) 8935 Ops.push_back(Op.getOperand(i)); 8936 8937 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 8938 DAG.ReplaceAllUsesWith(Op, New); 8939 return SDValue(New.getNode(), 1); 8940} 8941 8942/// Emit nodes that will be selected as "cmp Op0,Op1", or something 8943/// equivalent. 
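/// A compare against zero is forwarded to EmitTest; otherwise integer
/// compares are emitted as an EFLAGS-producing X86ISD::SUB so that an
/// identical subtraction elsewhere can CSE with the compare.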
8944SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 8945 SelectionDAG &DAG) const { 8946 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 8947 if (C->getAPIntValue() == 0) 8948 return EmitTest(Op0, X86CC, DAG); 8949 8950 DebugLoc dl = Op0.getDebugLoc(); 8951 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || 8952 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { 8953 // Use SUB instead of CMP to enable CSE between SUB and CMP. 8954 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); 8955 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, 8956 Op0, Op1); 8957 return SDValue(Sub.getNode(), 1); 8958 } 8959 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 8960} 8961 8962/// Convert a comparison if required by the subtarget. 8963SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, 8964 SelectionDAG &DAG) const { 8965 // If the subtarget does not support the FUCOMI instruction, floating-point 8966 // comparisons have to be converted. 8967 if (Subtarget->hasCMov() || 8968 Cmp.getOpcode() != X86ISD::CMP || 8969 !Cmp.getOperand(0).getValueType().isFloatingPoint() || 8970 !Cmp.getOperand(1).getValueType().isFloatingPoint()) 8971 return Cmp; 8972 8973 // The instruction selector will select an FUCOM instruction instead of 8974 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence 8975 // build an SDNode sequence that transfers the result from FPSW into EFLAGS: 8976 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) 8977 DebugLoc dl = Cmp.getDebugLoc(); 8978 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); 8979 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); 8980 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, 8981 DAG.getConstant(8, MVT::i8)); 8982 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); 8983 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); 8984} 8985 8986static bool isAllOnes(SDValue V) { 8987 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8988 return C && C->isAllOnesValue(); 8989} 8990 8991/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 8992/// if it's possible. 8993SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 8994 DebugLoc dl, SelectionDAG &DAG) const { 8995 SDValue Op0 = And.getOperand(0); 8996 SDValue Op1 = And.getOperand(1); 8997 if (Op0.getOpcode() == ISD::TRUNCATE) 8998 Op0 = Op0.getOperand(0); 8999 if (Op1.getOpcode() == ISD::TRUNCATE) 9000 Op1 = Op1.getOperand(0); 9001 9002 SDValue LHS, RHS; 9003 if (Op1.getOpcode() == ISD::SHL) 9004 std::swap(Op0, Op1); 9005 if (Op0.getOpcode() == ISD::SHL) { 9006 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 9007 if (And00C->getZExtValue() == 1) { 9008 // If we looked past a truncate, check that it's only truncating away 9009 // known zeros. 
9010 unsigned BitWidth = Op0.getValueSizeInBits(); 9011 unsigned AndBitWidth = And.getValueSizeInBits(); 9012 if (BitWidth > AndBitWidth) { 9013 APInt Zeros, Ones; 9014 DAG.ComputeMaskedBits(Op0, Zeros, Ones); 9015 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 9016 return SDValue(); 9017 } 9018 LHS = Op1; 9019 RHS = Op0.getOperand(1); 9020 } 9021 } else if (Op1.getOpcode() == ISD::Constant) { 9022 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 9023 uint64_t AndRHSVal = AndRHS->getZExtValue(); 9024 SDValue AndLHS = Op0; 9025 9026 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { 9027 LHS = AndLHS.getOperand(0); 9028 RHS = AndLHS.getOperand(1); 9029 } 9030 9031 // Use BT if the immediate can't be encoded in a TEST instruction. 9032 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { 9033 LHS = AndLHS; 9034 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); 9035 } 9036 } 9037 9038 if (LHS.getNode()) { 9039 // If the LHS is of the form (x ^ -1) then replace the LHS with x and flip 9040 // the condition code later. 9041 bool Invert = false; 9042 if (LHS.getOpcode() == ISD::XOR && isAllOnes(LHS.getOperand(1))) { 9043 Invert = true; 9044 LHS = LHS.getOperand(0); 9045 } 9046 9047 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 9048 // instruction. Since the shift amount is in-range-or-undefined, we know 9049 // that doing a bittest on the i32 value is ok. We extend to i32 because 9050 // the encoding for the i16 version is larger than the i32 version. 9051 // Also promote i16 to i32 for performance / code size reason. 9052 if (LHS.getValueType() == MVT::i8 || 9053 LHS.getValueType() == MVT::i16) 9054 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 9055 9056 // If the operand types disagree, extend the shift amount to match. Since 9057 // BT ignores high bits (like shifts) we can use anyextend. 9058 if (LHS.getValueType() != RHS.getValueType()) 9059 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 9060 9061 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 9062 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 9063 // Flip the condition if the LHS was a not instruction 9064 if (Invert) 9065 Cond = X86::GetOppositeBranchCondition(Cond); 9066 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 9067 DAG.getConstant(Cond, MVT::i8), BT); 9068 } 9069 9070 return SDValue(); 9071} 9072 9073SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 9074 9075 if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); 9076 9077 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 9078 SDValue Op0 = Op.getOperand(0); 9079 SDValue Op1 = Op.getOperand(1); 9080 DebugLoc dl = Op.getDebugLoc(); 9081 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 9082 9083 // Optimize to BT if possible. 9084 // Lower (X & (1 << N)) == 0 to BT(X, N). 9085 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 9086 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 9087 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 9088 Op1.getOpcode() == ISD::Constant && 9089 cast<ConstantSDNode>(Op1)->isNullValue() && 9090 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 9091 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 9092 if (NewSetCC.getNode()) 9093 return NewSetCC; 9094 } 9095 9096 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 9097 // these. 
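// e.g. (setcc (X86setcc cond), 1, eq) and (setcc (X86setcc cond), 0, ne)
// simply reuse the existing flag result, while the other two forms use the
// opposite condition code.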
9098 if (Op1.getOpcode() == ISD::Constant && 9099 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 9100 cast<ConstantSDNode>(Op1)->isNullValue()) && 9101 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 9102 9103 // If the input is a setcc, then reuse the input setcc or use a new one with 9104 // the inverted condition. 9105 if (Op0.getOpcode() == X86ISD::SETCC) { 9106 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 9107 bool Invert = (CC == ISD::SETNE) ^ 9108 cast<ConstantSDNode>(Op1)->isNullValue(); 9109 if (!Invert) return Op0; 9110 9111 CCode = X86::GetOppositeBranchCondition(CCode); 9112 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 9113 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 9114 } 9115 } 9116 9117 bool isFP = Op1.getValueType().isFloatingPoint(); 9118 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 9119 if (X86CC == X86::COND_INVALID) 9120 return SDValue(); 9121 9122 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 9123 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); 9124 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 9125 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 9126} 9127 9128// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 9129// ones, and then concatenate the result back. 9130static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { 9131 EVT VT = Op.getValueType(); 9132 9133 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && 9134 "Unsupported value type for operation"); 9135 9136 unsigned NumElems = VT.getVectorNumElements(); 9137 DebugLoc dl = Op.getDebugLoc(); 9138 SDValue CC = Op.getOperand(2); 9139 9140 // Extract the LHS vectors 9141 SDValue LHS = Op.getOperand(0); 9142 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 9143 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 9144 9145 // Extract the RHS vectors 9146 SDValue RHS = Op.getOperand(1); 9147 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 9148 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 9149 9150 // Issue the operation on the smaller types and concatenate the result back 9151 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 9152 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 9153 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 9154 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), 9155 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); 9156} 9157 9158SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 9159 SDValue Cond; 9160 SDValue Op0 = Op.getOperand(0); 9161 SDValue Op1 = Op.getOperand(1); 9162 SDValue CC = Op.getOperand(2); 9163 EVT VT = Op.getValueType(); 9164 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 9165 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 9166 DebugLoc dl = Op.getDebugLoc(); 9167 9168 if (isFP) { 9169#ifndef NDEBUG 9170 EVT EltVT = Op0.getValueType().getVectorElementType(); 9171 assert(EltVT == MVT::f32 || EltVT == MVT::f64); 9172#endif 9173 9174 unsigned SSECC; 9175 bool Swap = false; 9176 9177 // SSE Condition code mapping: 9178 // 0 - EQ 9179 // 1 - LT 9180 // 2 - LE 9181 // 3 - UNORD 9182 // 4 - NEQ 9183 // 5 - NLT 9184 // 6 - NLE 9185 // 7 - ORD 9186 switch (SetCCOpcode) { 9187 default: llvm_unreachable("Unexpected SETCC condition"); 9188 case ISD::SETOEQ: 9189 case ISD::SETEQ: SSECC = 0; break; 9190 case ISD::SETOGT: 9191 case ISD::SETGT: Swap = true; // Fallthrough 9192 case ISD::SETLT: 9193 case ISD::SETOLT: SSECC = 1; break; 9194 case ISD::SETOGE: 9195 case ISD::SETGE: Swap = true; // 
Fallthrough 9196 case ISD::SETLE: 9197 case ISD::SETOLE: SSECC = 2; break; 9198 case ISD::SETUO: SSECC = 3; break; 9199 case ISD::SETUNE: 9200 case ISD::SETNE: SSECC = 4; break; 9201 case ISD::SETULE: Swap = true; // Fallthrough 9202 case ISD::SETUGE: SSECC = 5; break; 9203 case ISD::SETULT: Swap = true; // Fallthrough 9204 case ISD::SETUGT: SSECC = 6; break; 9205 case ISD::SETO: SSECC = 7; break; 9206 case ISD::SETUEQ: 9207 case ISD::SETONE: SSECC = 8; break; 9208 } 9209 if (Swap) 9210 std::swap(Op0, Op1); 9211 9212 // In the two special cases we can't handle, emit two comparisons. 9213 if (SSECC == 8) { 9214 unsigned CC0, CC1; 9215 unsigned CombineOpc; 9216 if (SetCCOpcode == ISD::SETUEQ) { 9217 CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; 9218 } else { 9219 assert(SetCCOpcode == ISD::SETONE); 9220 CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; 9221 } 9222 9223 SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 9224 DAG.getConstant(CC0, MVT::i8)); 9225 SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 9226 DAG.getConstant(CC1, MVT::i8)); 9227 return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); 9228 } 9229 // Handle all other FP comparisons here. 9230 return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 9231 DAG.getConstant(SSECC, MVT::i8)); 9232 } 9233 9234 // Break 256-bit integer vector compare into smaller ones. 9235 if (VT.is256BitVector() && !Subtarget->hasInt256()) 9236 return Lower256IntVSETCC(Op, DAG); 9237 9238 // We are handling one of the integer comparisons here. Since SSE only has 9239 // GT and EQ comparisons for integer, swapping operands and multiple 9240 // operations may be required for some comparisons. 9241 unsigned Opc; 9242 bool Swap = false, Invert = false, FlipSigns = false; 9243 9244 switch (SetCCOpcode) { 9245 default: llvm_unreachable("Unexpected SETCC condition"); 9246 case ISD::SETNE: Invert = true; 9247 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; 9248 case ISD::SETLT: Swap = true; 9249 case ISD::SETGT: Opc = X86ISD::PCMPGT; break; 9250 case ISD::SETGE: Swap = true; 9251 case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; 9252 case ISD::SETULT: Swap = true; 9253 case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; 9254 case ISD::SETUGE: Swap = true; 9255 case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; 9256 } 9257 if (Swap) 9258 std::swap(Op0, Op1); 9259 9260 // Check that the operation in question is available (most are plain SSE2, 9261 // but PCMPGTQ and PCMPEQQ have different requirements). 9262 if (VT == MVT::v2i64) { 9263 if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) 9264 return SDValue(); 9265 if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { 9266 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with 9267 // pcmpeqd + pshufd + pand. 9268 assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); 9269 9270 // First cast everything to the right type, 9271 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); 9272 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); 9273 9274 // Do the compare. 9275 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); 9276 9277 // Make sure the lower and upper halves are both all-ones. 
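      // The {1, 0, 3, 2} pshufd mask below swaps the two 32-bit halves within
      // each 64-bit element, so after the AND a 64-bit lane is all-ones
      // exactly when both of its 32-bit halves compared equal.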
9278 const int Mask[] = { 1, 0, 3, 2 }; 9279 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); 9280 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); 9281 9282 if (Invert) 9283 Result = DAG.getNOT(dl, Result, MVT::v4i32); 9284 9285 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 9286 } 9287 } 9288 9289 // Since SSE has no unsigned integer comparisons, we need to flip the sign 9290 // bits of the inputs before performing those operations. 9291 if (FlipSigns) { 9292 EVT EltVT = VT.getVectorElementType(); 9293 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 9294 EltVT); 9295 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 9296 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 9297 SignBits.size()); 9298 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 9299 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 9300 } 9301 9302 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 9303 9304 // If the logical-not of the result is required, perform that now. 9305 if (Invert) 9306 Result = DAG.getNOT(dl, Result, VT); 9307 9308 return Result; 9309} 9310 9311// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 9312static bool isX86LogicalCmp(SDValue Op) { 9313 unsigned Opc = Op.getNode()->getOpcode(); 9314 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || 9315 Opc == X86ISD::SAHF) 9316 return true; 9317 if (Op.getResNo() == 1 && 9318 (Opc == X86ISD::ADD || 9319 Opc == X86ISD::SUB || 9320 Opc == X86ISD::ADC || 9321 Opc == X86ISD::SBB || 9322 Opc == X86ISD::SMUL || 9323 Opc == X86ISD::UMUL || 9324 Opc == X86ISD::INC || 9325 Opc == X86ISD::DEC || 9326 Opc == X86ISD::OR || 9327 Opc == X86ISD::XOR || 9328 Opc == X86ISD::AND)) 9329 return true; 9330 9331 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 9332 return true; 9333 9334 return false; 9335} 9336 9337static bool isZero(SDValue V) { 9338 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 9339 return C && C->isNullValue(); 9340} 9341 9342static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { 9343 if (V.getOpcode() != ISD::TRUNCATE) 9344 return false; 9345 9346 SDValue VOp0 = V.getOperand(0); 9347 unsigned InBits = VOp0.getValueSizeInBits(); 9348 unsigned Bits = V.getValueSizeInBits(); 9349 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); 9350} 9351 9352SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 9353 bool addTest = true; 9354 SDValue Cond = Op.getOperand(0); 9355 SDValue Op1 = Op.getOperand(1); 9356 SDValue Op2 = Op.getOperand(2); 9357 DebugLoc DL = Op.getDebugLoc(); 9358 SDValue CC; 9359 9360 if (Cond.getOpcode() == ISD::SETCC) { 9361 SDValue NewCond = LowerSETCC(Cond, DAG); 9362 if (NewCond.getNode()) 9363 Cond = NewCond; 9364 } 9365 9366 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 9367 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 9368 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 9369 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 9370 if (Cond.getOpcode() == X86ISD::SETCC && 9371 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 9372 isZero(Cond.getOperand(1).getOperand(1))) { 9373 SDValue Cmp = Cond.getOperand(1); 9374 9375 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 9376 9377 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 9378 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 9379 SDValue Y = isAllOnes(Op2) ? 
Op1 : Op2; 9380 9381 SDValue CmpOp0 = Cmp.getOperand(0); 9382 // Apply further optimizations for special cases 9383 // (select (x != 0), -1, 0) -> neg & sbb 9384 // (select (x == 0), 0, -1) -> neg & sbb 9385 if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) 9386 if (YC->isNullValue() && 9387 (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { 9388 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); 9389 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, 9390 DAG.getConstant(0, CmpOp0.getValueType()), 9391 CmpOp0); 9392 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 9393 DAG.getConstant(X86::COND_B, MVT::i8), 9394 SDValue(Neg.getNode(), 1)); 9395 return Res; 9396 } 9397 9398 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 9399 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 9400 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 9401 9402 SDValue Res = // Res = 0 or -1. 9403 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 9404 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 9405 9406 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 9407 Res = DAG.getNOT(DL, Res, Res.getValueType()); 9408 9409 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 9410 if (N2C == 0 || !N2C->isNullValue()) 9411 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 9412 return Res; 9413 } 9414 } 9415 9416 // Look past (and (setcc_carry (cmp ...)), 1). 9417 if (Cond.getOpcode() == ISD::AND && 9418 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 9419 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 9420 if (C && C->getAPIntValue() == 1) 9421 Cond = Cond.getOperand(0); 9422 } 9423 9424 // If condition flag is set by a X86ISD::CMP, then use it as the condition 9425 // setting operand in place of the X86ISD::SETCC. 9426 unsigned CondOpcode = Cond.getOpcode(); 9427 if (CondOpcode == X86ISD::SETCC || 9428 CondOpcode == X86ISD::SETCC_CARRY) { 9429 CC = Cond.getOperand(0); 9430 9431 SDValue Cmp = Cond.getOperand(1); 9432 unsigned Opc = Cmp.getOpcode(); 9433 EVT VT = Op.getValueType(); 9434 9435 bool IllegalFPCMov = false; 9436 if (VT.isFloatingPoint() && !VT.isVector() && 9437 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
9438 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 9439 9440 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 9441 Opc == X86ISD::BT) { // FIXME 9442 Cond = Cmp; 9443 addTest = false; 9444 } 9445 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 9446 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 9447 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 9448 Cond.getOperand(0).getValueType() != MVT::i8)) { 9449 SDValue LHS = Cond.getOperand(0); 9450 SDValue RHS = Cond.getOperand(1); 9451 unsigned X86Opcode; 9452 unsigned X86Cond; 9453 SDVTList VTs; 9454 switch (CondOpcode) { 9455 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 9456 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 9457 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 9458 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 9459 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 9460 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 9461 default: llvm_unreachable("unexpected overflowing operator"); 9462 } 9463 if (CondOpcode == ISD::UMULO) 9464 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 9465 MVT::i32); 9466 else 9467 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 9468 9469 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); 9470 9471 if (CondOpcode == ISD::UMULO) 9472 Cond = X86Op.getValue(2); 9473 else 9474 Cond = X86Op.getValue(1); 9475 9476 CC = DAG.getConstant(X86Cond, MVT::i8); 9477 addTest = false; 9478 } 9479 9480 if (addTest) { 9481 // Look pass the truncate if the high bits are known zero. 9482 if (isTruncWithZeroHighBitsInput(Cond, DAG)) 9483 Cond = Cond.getOperand(0); 9484 9485 // We know the result of AND is compared against zero. Try to match 9486 // it to BT. 9487 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 9488 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 9489 if (NewSetCC.getNode()) { 9490 CC = NewSetCC.getOperand(0); 9491 Cond = NewSetCC.getOperand(1); 9492 addTest = false; 9493 } 9494 } 9495 } 9496 9497 if (addTest) { 9498 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 9499 Cond = EmitTest(Cond, X86::COND_NE, DAG); 9500 } 9501 9502 // a < b ? -1 : 0 -> RES = ~setcc_carry 9503 // a < b ? 0 : -1 -> RES = setcc_carry 9504 // a >= b ? -1 : 0 -> RES = setcc_carry 9505 // a >= b ? 0 : -1 -> RES = ~setcc_carry 9506 if (Cond.getOpcode() == X86ISD::SUB) { 9507 Cond = ConvertCmpIfNecessary(Cond, DAG); 9508 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 9509 9510 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 9511 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 9512 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 9513 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 9514 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 9515 return DAG.getNOT(DL, Res, Res.getValueType()); 9516 return Res; 9517 } 9518 } 9519 9520 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate 9521 // widen the cmov and push the truncate through. This avoids introducing a new 9522 // branch during isel and doesn't add any extensions. 
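  // For example, (select i1 %c, (trunc i32 %a to i8), (trunc i32 %b to i8))
  // becomes, roughly, (trunc (X86cmov i32 %b, %a, cc, flags) to i8).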
9523 if (Op.getValueType() == MVT::i8 && 9524 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { 9525 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); 9526 if (T1.getValueType() == T2.getValueType() && 9527 // Blacklist CopyFromReg to avoid partial register stalls. 9528 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ 9529 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); 9530 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); 9531 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); 9532 } 9533 } 9534 9535 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 9536 // condition is true. 9537 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 9538 SDValue Ops[] = { Op2, Op1, CC, Cond }; 9539 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 9540} 9541 9542// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 9543// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 9544// from the AND / OR. 9545static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 9546 Opc = Op.getOpcode(); 9547 if (Opc != ISD::OR && Opc != ISD::AND) 9548 return false; 9549 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 9550 Op.getOperand(0).hasOneUse() && 9551 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 9552 Op.getOperand(1).hasOneUse()); 9553} 9554 9555// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 9556// 1 and that the SETCC node has a single use. 9557static bool isXor1OfSetCC(SDValue Op) { 9558 if (Op.getOpcode() != ISD::XOR) 9559 return false; 9560 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 9561 if (N1C && N1C->getAPIntValue() == 1) { 9562 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 9563 Op.getOperand(0).hasOneUse(); 9564 } 9565 return false; 9566} 9567 9568SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 9569 bool addTest = true; 9570 SDValue Chain = Op.getOperand(0); 9571 SDValue Cond = Op.getOperand(1); 9572 SDValue Dest = Op.getOperand(2); 9573 DebugLoc dl = Op.getDebugLoc(); 9574 SDValue CC; 9575 bool Inverted = false; 9576 9577 if (Cond.getOpcode() == ISD::SETCC) { 9578 // Check for setcc([su]{add,sub,mul}o == 0). 9579 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && 9580 isa<ConstantSDNode>(Cond.getOperand(1)) && 9581 cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && 9582 Cond.getOperand(0).getResNo() == 1 && 9583 (Cond.getOperand(0).getOpcode() == ISD::SADDO || 9584 Cond.getOperand(0).getOpcode() == ISD::UADDO || 9585 Cond.getOperand(0).getOpcode() == ISD::SSUBO || 9586 Cond.getOperand(0).getOpcode() == ISD::USUBO || 9587 Cond.getOperand(0).getOpcode() == ISD::SMULO || 9588 Cond.getOperand(0).getOpcode() == ISD::UMULO)) { 9589 Inverted = true; 9590 Cond = Cond.getOperand(0); 9591 } else { 9592 SDValue NewCond = LowerSETCC(Cond, DAG); 9593 if (NewCond.getNode()) 9594 Cond = NewCond; 9595 } 9596 } 9597#if 0 9598 // FIXME: LowerXALUO doesn't handle these!! 9599 else if (Cond.getOpcode() == X86ISD::ADD || 9600 Cond.getOpcode() == X86ISD::SUB || 9601 Cond.getOpcode() == X86ISD::SMUL || 9602 Cond.getOpcode() == X86ISD::UMUL) 9603 Cond = LowerXALUO(Cond, DAG); 9604#endif 9605 9606 // Look pass (and (setcc_carry (cmp ...)), 1). 
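  // SETCC_CARRY materializes 0 or all-ones from the carry flag, so masking it
  // with 1 cannot change whether the value is zero; it is safe to look through
  // the AND and use the setcc_carry itself as the condition.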
9607 if (Cond.getOpcode() == ISD::AND && 9608 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 9609 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 9610 if (C && C->getAPIntValue() == 1) 9611 Cond = Cond.getOperand(0); 9612 } 9613 9614 // If condition flag is set by a X86ISD::CMP, then use it as the condition 9615 // setting operand in place of the X86ISD::SETCC. 9616 unsigned CondOpcode = Cond.getOpcode(); 9617 if (CondOpcode == X86ISD::SETCC || 9618 CondOpcode == X86ISD::SETCC_CARRY) { 9619 CC = Cond.getOperand(0); 9620 9621 SDValue Cmp = Cond.getOperand(1); 9622 unsigned Opc = Cmp.getOpcode(); 9623 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 9624 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 9625 Cond = Cmp; 9626 addTest = false; 9627 } else { 9628 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 9629 default: break; 9630 case X86::COND_O: 9631 case X86::COND_B: 9632 // These can only come from an arithmetic instruction with overflow, 9633 // e.g. SADDO, UADDO. 9634 Cond = Cond.getNode()->getOperand(1); 9635 addTest = false; 9636 break; 9637 } 9638 } 9639 } 9640 CondOpcode = Cond.getOpcode(); 9641 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 9642 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 9643 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 9644 Cond.getOperand(0).getValueType() != MVT::i8)) { 9645 SDValue LHS = Cond.getOperand(0); 9646 SDValue RHS = Cond.getOperand(1); 9647 unsigned X86Opcode; 9648 unsigned X86Cond; 9649 SDVTList VTs; 9650 switch (CondOpcode) { 9651 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 9652 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 9653 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 9654 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 9655 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 9656 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 9657 default: llvm_unreachable("unexpected overflowing operator"); 9658 } 9659 if (Inverted) 9660 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); 9661 if (CondOpcode == ISD::UMULO) 9662 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 9663 MVT::i32); 9664 else 9665 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 9666 9667 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); 9668 9669 if (CondOpcode == ISD::UMULO) 9670 Cond = X86Op.getValue(2); 9671 else 9672 Cond = X86Op.getValue(1); 9673 9674 CC = DAG.getConstant(X86Cond, MVT::i8); 9675 addTest = false; 9676 } else { 9677 unsigned CondOpc; 9678 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 9679 SDValue Cmp = Cond.getOperand(0).getOperand(1); 9680 if (CondOpc == ISD::OR) { 9681 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 9682 // two branches instead of an explicit OR instruction with a 9683 // separate test. 9684 if (Cmp == Cond.getOperand(1).getOperand(1) && 9685 isX86LogicalCmp(Cmp)) { 9686 CC = Cond.getOperand(0).getOperand(0); 9687 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 9688 Chain, Dest, CC, Cmp); 9689 CC = Cond.getOperand(1).getOperand(0); 9690 Cond = Cmp; 9691 addTest = false; 9692 } 9693 } else { // ISD::AND 9694 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 9695 // two branches instead of an explicit AND instruction with a 9696 // separate test. 
However, we only do this if this block doesn't 9697 // have a fall-through edge, because this requires an explicit 9698 // jmp when the condition is false. 9699 if (Cmp == Cond.getOperand(1).getOperand(1) && 9700 isX86LogicalCmp(Cmp) && 9701 Op.getNode()->hasOneUse()) { 9702 X86::CondCode CCode = 9703 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 9704 CCode = X86::GetOppositeBranchCondition(CCode); 9705 CC = DAG.getConstant(CCode, MVT::i8); 9706 SDNode *User = *Op.getNode()->use_begin(); 9707 // Look for an unconditional branch following this conditional branch. 9708 // We need this because we need to reverse the successors in order 9709 // to implement FCMP_OEQ. 9710 if (User->getOpcode() == ISD::BR) { 9711 SDValue FalseBB = User->getOperand(1); 9712 SDNode *NewBR = 9713 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 9714 assert(NewBR == User); 9715 (void)NewBR; 9716 Dest = FalseBB; 9717 9718 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 9719 Chain, Dest, CC, Cmp); 9720 X86::CondCode CCode = 9721 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 9722 CCode = X86::GetOppositeBranchCondition(CCode); 9723 CC = DAG.getConstant(CCode, MVT::i8); 9724 Cond = Cmp; 9725 addTest = false; 9726 } 9727 } 9728 } 9729 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 9730 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 9731 // It should be transformed during dag combiner except when the condition 9732 // is set by a arithmetics with overflow node. 9733 X86::CondCode CCode = 9734 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 9735 CCode = X86::GetOppositeBranchCondition(CCode); 9736 CC = DAG.getConstant(CCode, MVT::i8); 9737 Cond = Cond.getOperand(0).getOperand(1); 9738 addTest = false; 9739 } else if (Cond.getOpcode() == ISD::SETCC && 9740 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { 9741 // For FCMP_OEQ, we can emit 9742 // two branches instead of an explicit AND instruction with a 9743 // separate test. However, we only do this if this block doesn't 9744 // have a fall-through edge, because this requires an explicit 9745 // jmp when the condition is false. 9746 if (Op.getNode()->hasOneUse()) { 9747 SDNode *User = *Op.getNode()->use_begin(); 9748 // Look for an unconditional branch following this conditional branch. 9749 // We need this because we need to reverse the successors in order 9750 // to implement FCMP_OEQ. 9751 if (User->getOpcode() == ISD::BR) { 9752 SDValue FalseBB = User->getOperand(1); 9753 SDNode *NewBR = 9754 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 9755 assert(NewBR == User); 9756 (void)NewBR; 9757 Dest = FalseBB; 9758 9759 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 9760 Cond.getOperand(0), Cond.getOperand(1)); 9761 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 9762 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 9763 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 9764 Chain, Dest, CC, Cmp); 9765 CC = DAG.getConstant(X86::COND_P, MVT::i8); 9766 Cond = Cmp; 9767 addTest = false; 9768 } 9769 } 9770 } else if (Cond.getOpcode() == ISD::SETCC && 9771 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { 9772 // For FCMP_UNE, we can emit 9773 // two branches instead of an explicit AND instruction with a 9774 // separate test. However, we only do this if this block doesn't 9775 // have a fall-through edge, because this requires an explicit 9776 // jmp when the condition is false. 
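  // Concretely (illustrative), for br (setcc une x, y), %true, %false this
  // ends up as:
  //   ucomiss x, y
  //   jne %true      ; not-equal half of UNE
  //   jnp %false     ; ordered and equal, take the false edge
  //   jmp %true      ; unordered (PF set) falls into the true edge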
9777 if (Op.getNode()->hasOneUse()) { 9778 SDNode *User = *Op.getNode()->use_begin(); 9779 // Look for an unconditional branch following this conditional branch. 9780 // We need this because we need to reverse the successors in order 9781 // to implement FCMP_UNE. 9782 if (User->getOpcode() == ISD::BR) { 9783 SDValue FalseBB = User->getOperand(1); 9784 SDNode *NewBR = 9785 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 9786 assert(NewBR == User); 9787 (void)NewBR; 9788 9789 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 9790 Cond.getOperand(0), Cond.getOperand(1)); 9791 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 9792 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 9793 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 9794 Chain, Dest, CC, Cmp); 9795 CC = DAG.getConstant(X86::COND_NP, MVT::i8); 9796 Cond = Cmp; 9797 addTest = false; 9798 Dest = FalseBB; 9799 } 9800 } 9801 } 9802 } 9803 9804 if (addTest) { 9805 // Look pass the truncate if the high bits are known zero. 9806 if (isTruncWithZeroHighBitsInput(Cond, DAG)) 9807 Cond = Cond.getOperand(0); 9808 9809 // We know the result of AND is compared against zero. Try to match 9810 // it to BT. 9811 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 9812 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 9813 if (NewSetCC.getNode()) { 9814 CC = NewSetCC.getOperand(0); 9815 Cond = NewSetCC.getOperand(1); 9816 addTest = false; 9817 } 9818 } 9819 } 9820 9821 if (addTest) { 9822 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 9823 Cond = EmitTest(Cond, X86::COND_NE, DAG); 9824 } 9825 Cond = ConvertCmpIfNecessary(Cond, DAG); 9826 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 9827 Chain, Dest, CC, Cond); 9828} 9829 9830// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 9831// Calls to _alloca is needed to probe the stack when allocating more than 4k 9832// bytes in one go. Touching the stack at 4K increments is necessary to ensure 9833// that the guard pages used by the OS virtual memory manager are allocated in 9834// correct sequence. 9835SDValue 9836X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 9837 SelectionDAG &DAG) const { 9838 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || 9839 getTargetMachine().Options.EnableSegmentedStacks) && 9840 "This should be used only on Windows targets or when segmented stacks " 9841 "are being used"); 9842 assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); 9843 DebugLoc dl = Op.getDebugLoc(); 9844 9845 // Get the inputs. 9846 SDValue Chain = Op.getOperand(0); 9847 SDValue Size = Op.getOperand(1); 9848 // FIXME: Ensure alignment here 9849 9850 bool Is64Bit = Subtarget->is64Bit(); 9851 EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; 9852 9853 if (getTargetMachine().Options.EnableSegmentedStacks) { 9854 MachineFunction &MF = DAG.getMachineFunction(); 9855 MachineRegisterInfo &MRI = MF.getRegInfo(); 9856 9857 if (Is64Bit) { 9858 // The 64 bit implementation of segmented stacks needs to clobber both r10 9859 // r11. This makes it impossible to use it along with nested parameters. 9860 const Function *F = MF.getFunction(); 9861 9862 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); 9863 I != E; ++I) 9864 if (I->hasNestAttr()) 9865 report_fatal_error("Cannot use segmented stacks with functions that " 9866 "have nested arguments."); 9867 } 9868 9869 const TargetRegisterClass *AddrRegClass = 9870 getRegClassFor(Subtarget->is64Bit() ? 
MVT::i64:MVT::i32); 9871 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); 9872 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); 9873 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, 9874 DAG.getRegister(Vreg, SPTy)); 9875 SDValue Ops1[2] = { Value, Chain }; 9876 return DAG.getMergeValues(Ops1, 2, dl); 9877 } else { 9878 SDValue Flag; 9879 unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); 9880 9881 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 9882 Flag = Chain.getValue(1); 9883 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 9884 9885 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 9886 Flag = Chain.getValue(1); 9887 9888 Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), 9889 SPTy).getValue(1); 9890 9891 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 9892 return DAG.getMergeValues(Ops1, 2, dl); 9893 } 9894} 9895 9896SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 9897 MachineFunction &MF = DAG.getMachineFunction(); 9898 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 9899 9900 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9901 DebugLoc DL = Op.getDebugLoc(); 9902 9903 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 9904 // vastart just stores the address of the VarArgsFrameIndex slot into the 9905 // memory location argument. 9906 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 9907 getPointerTy()); 9908 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 9909 MachinePointerInfo(SV), false, false, 0); 9910 } 9911 9912 // __va_list_tag: 9913 // gp_offset (0 - 6 * 8) 9914 // fp_offset (48 - 48 + 8 * 16) 9915 // overflow_arg_area (point to parameters coming in memory). 9916 // reg_save_area 9917 SmallVector<SDValue, 8> MemOps; 9918 SDValue FIN = Op.getOperand(1); 9919 // Store gp_offset 9920 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 9921 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 9922 MVT::i32), 9923 FIN, MachinePointerInfo(SV), false, false, 0); 9924 MemOps.push_back(Store); 9925 9926 // Store fp_offset 9927 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9928 FIN, DAG.getIntPtrConstant(4)); 9929 Store = DAG.getStore(Op.getOperand(0), DL, 9930 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 9931 MVT::i32), 9932 FIN, MachinePointerInfo(SV, 4), false, false, 0); 9933 MemOps.push_back(Store); 9934 9935 // Store ptr to overflow_arg_area 9936 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9937 FIN, DAG.getIntPtrConstant(4)); 9938 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 9939 getPointerTy()); 9940 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 9941 MachinePointerInfo(SV, 8), 9942 false, false, 0); 9943 MemOps.push_back(Store); 9944 9945 // Store ptr to reg_save_area. 
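  // reg_save_area sits at byte offset 16 of the va_list, after the two i32
  // offsets (bytes 0 and 4) and the overflow_arg_area pointer at byte 8.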
9946 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9947 FIN, DAG.getIntPtrConstant(8)); 9948 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 9949 getPointerTy()); 9950 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 9951 MachinePointerInfo(SV, 16), false, false, 0); 9952 MemOps.push_back(Store); 9953 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 9954 &MemOps[0], MemOps.size()); 9955} 9956 9957SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 9958 assert(Subtarget->is64Bit() && 9959 "LowerVAARG only handles 64-bit va_arg!"); 9960 assert((Subtarget->isTargetLinux() || 9961 Subtarget->isTargetDarwin()) && 9962 "Unhandled target in LowerVAARG"); 9963 assert(Op.getNode()->getNumOperands() == 4); 9964 SDValue Chain = Op.getOperand(0); 9965 SDValue SrcPtr = Op.getOperand(1); 9966 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9967 unsigned Align = Op.getConstantOperandVal(3); 9968 DebugLoc dl = Op.getDebugLoc(); 9969 9970 EVT ArgVT = Op.getNode()->getValueType(0); 9971 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 9972 uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); 9973 uint8_t ArgMode; 9974 9975 // Decide which area this value should be read from. 9976 // TODO: Implement the AMD64 ABI in its entirety. This simple 9977 // selection mechanism works only for the basic types. 9978 if (ArgVT == MVT::f80) { 9979 llvm_unreachable("va_arg for f80 not yet implemented"); 9980 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 9981 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 9982 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 9983 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 9984 } else { 9985 llvm_unreachable("Unhandled argument type in LowerVAARG"); 9986 } 9987 9988 if (ArgMode == 2) { 9989 // Sanity Check: Make sure using fp_offset makes sense. 9990 assert(!getTargetMachine().Options.UseSoftFloat && 9991 !(DAG.getMachineFunction() 9992 .getFunction()->getFnAttributes() 9993 .hasAttribute(Attribute::NoImplicitFloat)) && 9994 Subtarget->hasSSE1()); 9995 } 9996 9997 // Insert VAARG_64 node into the DAG 9998 // VAARG_64 returns two values: Variable Argument Address, Chain 9999 SmallVector<SDValue, 11> InstOps; 10000 InstOps.push_back(Chain); 10001 InstOps.push_back(SrcPtr); 10002 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 10003 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 10004 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 10005 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 10006 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 10007 VTs, &InstOps[0], InstOps.size(), 10008 MVT::i64, 10009 MachinePointerInfo(SV), 10010 /*Align=*/0, 10011 /*Volatile=*/false, 10012 /*ReadMem=*/true, 10013 /*WriteMem=*/true); 10014 Chain = VAARG.getValue(1); 10015 10016 // Load the next argument and return it 10017 return DAG.getLoad(ArgVT, dl, 10018 Chain, 10019 VAARG, 10020 MachinePointerInfo(), 10021 false, false, false, 0); 10022} 10023 10024static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, 10025 SelectionDAG &DAG) { 10026 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
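  // That struct is 24 bytes (4 + 4 + 8 + 8), which is why va_copy below is
  // simply a 24-byte, 8-byte-aligned memcpy from the source to the
  // destination va_list.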
10027 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 10028 SDValue Chain = Op.getOperand(0); 10029 SDValue DstPtr = Op.getOperand(1); 10030 SDValue SrcPtr = Op.getOperand(2); 10031 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 10032 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 10033 DebugLoc DL = Op.getDebugLoc(); 10034 10035 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 10036 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 10037 false, 10038 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 10039} 10040 10041// getTargetVShiftNOde - Handle vector element shifts where the shift amount 10042// may or may not be a constant. Takes immediate version of shift as input. 10043static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, 10044 SDValue SrcOp, SDValue ShAmt, 10045 SelectionDAG &DAG) { 10046 assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); 10047 10048 if (isa<ConstantSDNode>(ShAmt)) { 10049 // Constant may be a TargetConstant. Use a regular constant. 10050 uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue(); 10051 switch (Opc) { 10052 default: llvm_unreachable("Unknown target vector shift node"); 10053 case X86ISD::VSHLI: 10054 case X86ISD::VSRLI: 10055 case X86ISD::VSRAI: 10056 return DAG.getNode(Opc, dl, VT, SrcOp, 10057 DAG.getConstant(ShiftAmt, MVT::i32)); 10058 } 10059 } 10060 10061 // Change opcode to non-immediate version 10062 switch (Opc) { 10063 default: llvm_unreachable("Unknown target vector shift node"); 10064 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; 10065 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; 10066 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; 10067 } 10068 10069 // Need to build a vector containing shift amount 10070 // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 10071 SDValue ShOps[4]; 10072 ShOps[0] = ShAmt; 10073 ShOps[1] = DAG.getConstant(0, MVT::i32); 10074 ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); 10075 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); 10076 10077 // The return type has to be a 128-bit type with the same element 10078 // type as the input type. 10079 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 10080 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); 10081 10082 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); 10083 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); 10084} 10085 10086static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 10087 DebugLoc dl = Op.getDebugLoc(); 10088 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 10089 switch (IntNo) { 10090 default: return SDValue(); // Don't custom lower most intrinsics. 10091 // Comparison intrinsics. 
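  // Each of these lowers to a COMI/UCOMI node on the scalar operands followed
  // by a SETCC on EFLAGS, zero-extended to the i32 the intrinsic returns;
  // e.g. comieq_ss becomes roughly (zext (x86_setcc E, (x86_comi a, b))).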
10092 case Intrinsic::x86_sse_comieq_ss: 10093 case Intrinsic::x86_sse_comilt_ss: 10094 case Intrinsic::x86_sse_comile_ss: 10095 case Intrinsic::x86_sse_comigt_ss: 10096 case Intrinsic::x86_sse_comige_ss: 10097 case Intrinsic::x86_sse_comineq_ss: 10098 case Intrinsic::x86_sse_ucomieq_ss: 10099 case Intrinsic::x86_sse_ucomilt_ss: 10100 case Intrinsic::x86_sse_ucomile_ss: 10101 case Intrinsic::x86_sse_ucomigt_ss: 10102 case Intrinsic::x86_sse_ucomige_ss: 10103 case Intrinsic::x86_sse_ucomineq_ss: 10104 case Intrinsic::x86_sse2_comieq_sd: 10105 case Intrinsic::x86_sse2_comilt_sd: 10106 case Intrinsic::x86_sse2_comile_sd: 10107 case Intrinsic::x86_sse2_comigt_sd: 10108 case Intrinsic::x86_sse2_comige_sd: 10109 case Intrinsic::x86_sse2_comineq_sd: 10110 case Intrinsic::x86_sse2_ucomieq_sd: 10111 case Intrinsic::x86_sse2_ucomilt_sd: 10112 case Intrinsic::x86_sse2_ucomile_sd: 10113 case Intrinsic::x86_sse2_ucomigt_sd: 10114 case Intrinsic::x86_sse2_ucomige_sd: 10115 case Intrinsic::x86_sse2_ucomineq_sd: { 10116 unsigned Opc; 10117 ISD::CondCode CC; 10118 switch (IntNo) { 10119 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 10120 case Intrinsic::x86_sse_comieq_ss: 10121 case Intrinsic::x86_sse2_comieq_sd: 10122 Opc = X86ISD::COMI; 10123 CC = ISD::SETEQ; 10124 break; 10125 case Intrinsic::x86_sse_comilt_ss: 10126 case Intrinsic::x86_sse2_comilt_sd: 10127 Opc = X86ISD::COMI; 10128 CC = ISD::SETLT; 10129 break; 10130 case Intrinsic::x86_sse_comile_ss: 10131 case Intrinsic::x86_sse2_comile_sd: 10132 Opc = X86ISD::COMI; 10133 CC = ISD::SETLE; 10134 break; 10135 case Intrinsic::x86_sse_comigt_ss: 10136 case Intrinsic::x86_sse2_comigt_sd: 10137 Opc = X86ISD::COMI; 10138 CC = ISD::SETGT; 10139 break; 10140 case Intrinsic::x86_sse_comige_ss: 10141 case Intrinsic::x86_sse2_comige_sd: 10142 Opc = X86ISD::COMI; 10143 CC = ISD::SETGE; 10144 break; 10145 case Intrinsic::x86_sse_comineq_ss: 10146 case Intrinsic::x86_sse2_comineq_sd: 10147 Opc = X86ISD::COMI; 10148 CC = ISD::SETNE; 10149 break; 10150 case Intrinsic::x86_sse_ucomieq_ss: 10151 case Intrinsic::x86_sse2_ucomieq_sd: 10152 Opc = X86ISD::UCOMI; 10153 CC = ISD::SETEQ; 10154 break; 10155 case Intrinsic::x86_sse_ucomilt_ss: 10156 case Intrinsic::x86_sse2_ucomilt_sd: 10157 Opc = X86ISD::UCOMI; 10158 CC = ISD::SETLT; 10159 break; 10160 case Intrinsic::x86_sse_ucomile_ss: 10161 case Intrinsic::x86_sse2_ucomile_sd: 10162 Opc = X86ISD::UCOMI; 10163 CC = ISD::SETLE; 10164 break; 10165 case Intrinsic::x86_sse_ucomigt_ss: 10166 case Intrinsic::x86_sse2_ucomigt_sd: 10167 Opc = X86ISD::UCOMI; 10168 CC = ISD::SETGT; 10169 break; 10170 case Intrinsic::x86_sse_ucomige_ss: 10171 case Intrinsic::x86_sse2_ucomige_sd: 10172 Opc = X86ISD::UCOMI; 10173 CC = ISD::SETGE; 10174 break; 10175 case Intrinsic::x86_sse_ucomineq_ss: 10176 case Intrinsic::x86_sse2_ucomineq_sd: 10177 Opc = X86ISD::UCOMI; 10178 CC = ISD::SETNE; 10179 break; 10180 } 10181 10182 SDValue LHS = Op.getOperand(1); 10183 SDValue RHS = Op.getOperand(2); 10184 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 10185 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 10186 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 10187 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 10188 DAG.getConstant(X86CC, MVT::i8), Cond); 10189 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 10190 } 10191 10192 // Arithmetic intrinsics. 
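  // The pmulu_dq forms multiply the unsigned low 32-bit halves of each 64-bit
  // element and yield the full 64-bit products, mapping directly onto the
  // X86ISD::PMULUDQ node.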
10193 case Intrinsic::x86_sse2_pmulu_dq: 10194 case Intrinsic::x86_avx2_pmulu_dq: 10195 return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), 10196 Op.getOperand(1), Op.getOperand(2)); 10197 10198 // SSE2/AVX2 sub with unsigned saturation intrinsics 10199 case Intrinsic::x86_sse2_psubus_b: 10200 case Intrinsic::x86_sse2_psubus_w: 10201 case Intrinsic::x86_avx2_psubus_b: 10202 case Intrinsic::x86_avx2_psubus_w: 10203 return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(), 10204 Op.getOperand(1), Op.getOperand(2)); 10205 10206 // SSE3/AVX horizontal add/sub intrinsics 10207 case Intrinsic::x86_sse3_hadd_ps: 10208 case Intrinsic::x86_sse3_hadd_pd: 10209 case Intrinsic::x86_avx_hadd_ps_256: 10210 case Intrinsic::x86_avx_hadd_pd_256: 10211 case Intrinsic::x86_sse3_hsub_ps: 10212 case Intrinsic::x86_sse3_hsub_pd: 10213 case Intrinsic::x86_avx_hsub_ps_256: 10214 case Intrinsic::x86_avx_hsub_pd_256: 10215 case Intrinsic::x86_ssse3_phadd_w_128: 10216 case Intrinsic::x86_ssse3_phadd_d_128: 10217 case Intrinsic::x86_avx2_phadd_w: 10218 case Intrinsic::x86_avx2_phadd_d: 10219 case Intrinsic::x86_ssse3_phsub_w_128: 10220 case Intrinsic::x86_ssse3_phsub_d_128: 10221 case Intrinsic::x86_avx2_phsub_w: 10222 case Intrinsic::x86_avx2_phsub_d: { 10223 unsigned Opcode; 10224 switch (IntNo) { 10225 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 10226 case Intrinsic::x86_sse3_hadd_ps: 10227 case Intrinsic::x86_sse3_hadd_pd: 10228 case Intrinsic::x86_avx_hadd_ps_256: 10229 case Intrinsic::x86_avx_hadd_pd_256: 10230 Opcode = X86ISD::FHADD; 10231 break; 10232 case Intrinsic::x86_sse3_hsub_ps: 10233 case Intrinsic::x86_sse3_hsub_pd: 10234 case Intrinsic::x86_avx_hsub_ps_256: 10235 case Intrinsic::x86_avx_hsub_pd_256: 10236 Opcode = X86ISD::FHSUB; 10237 break; 10238 case Intrinsic::x86_ssse3_phadd_w_128: 10239 case Intrinsic::x86_ssse3_phadd_d_128: 10240 case Intrinsic::x86_avx2_phadd_w: 10241 case Intrinsic::x86_avx2_phadd_d: 10242 Opcode = X86ISD::HADD; 10243 break; 10244 case Intrinsic::x86_ssse3_phsub_w_128: 10245 case Intrinsic::x86_ssse3_phsub_d_128: 10246 case Intrinsic::x86_avx2_phsub_w: 10247 case Intrinsic::x86_avx2_phsub_d: 10248 Opcode = X86ISD::HSUB; 10249 break; 10250 } 10251 return DAG.getNode(Opcode, dl, Op.getValueType(), 10252 Op.getOperand(1), Op.getOperand(2)); 10253 } 10254 10255 // SSE2/SSE41/AVX2 integer max/min intrinsics. 
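  // These map onto the generic X86ISD::UMAX/UMIN/SMAX/SMIN nodes; e.g.
  // pmaxu.b (the intrinsic behind _mm_max_epu8) becomes X86ISD::UMAX on its
  // two vector operands.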
10256 case Intrinsic::x86_sse2_pmaxu_b: 10257 case Intrinsic::x86_sse41_pmaxuw: 10258 case Intrinsic::x86_sse41_pmaxud: 10259 case Intrinsic::x86_avx2_pmaxu_b: 10260 case Intrinsic::x86_avx2_pmaxu_w: 10261 case Intrinsic::x86_avx2_pmaxu_d: 10262 return DAG.getNode(X86ISD::UMAX, dl, Op.getValueType(), 10263 Op.getOperand(1), Op.getOperand(2)); 10264 case Intrinsic::x86_sse2_pminu_b: 10265 case Intrinsic::x86_sse41_pminuw: 10266 case Intrinsic::x86_sse41_pminud: 10267 case Intrinsic::x86_avx2_pminu_b: 10268 case Intrinsic::x86_avx2_pminu_w: 10269 case Intrinsic::x86_avx2_pminu_d: 10270 return DAG.getNode(X86ISD::UMIN, dl, Op.getValueType(), 10271 Op.getOperand(1), Op.getOperand(2)); 10272 case Intrinsic::x86_sse41_pmaxsb: 10273 case Intrinsic::x86_sse2_pmaxs_w: 10274 case Intrinsic::x86_sse41_pmaxsd: 10275 case Intrinsic::x86_avx2_pmaxs_b: 10276 case Intrinsic::x86_avx2_pmaxs_w: 10277 case Intrinsic::x86_avx2_pmaxs_d: 10278 return DAG.getNode(X86ISD::SMAX, dl, Op.getValueType(), 10279 Op.getOperand(1), Op.getOperand(2)); 10280 case Intrinsic::x86_sse41_pminsb: 10281 case Intrinsic::x86_sse2_pmins_w: 10282 case Intrinsic::x86_sse41_pminsd: 10283 case Intrinsic::x86_avx2_pmins_b: 10284 case Intrinsic::x86_avx2_pmins_w: 10285 case Intrinsic::x86_avx2_pmins_d: 10286 return DAG.getNode(X86ISD::SMIN, dl, Op.getValueType(), 10287 Op.getOperand(1), Op.getOperand(2)); 10288 10289 // AVX2 variable shift intrinsics 10290 case Intrinsic::x86_avx2_psllv_d: 10291 case Intrinsic::x86_avx2_psllv_q: 10292 case Intrinsic::x86_avx2_psllv_d_256: 10293 case Intrinsic::x86_avx2_psllv_q_256: 10294 case Intrinsic::x86_avx2_psrlv_d: 10295 case Intrinsic::x86_avx2_psrlv_q: 10296 case Intrinsic::x86_avx2_psrlv_d_256: 10297 case Intrinsic::x86_avx2_psrlv_q_256: 10298 case Intrinsic::x86_avx2_psrav_d: 10299 case Intrinsic::x86_avx2_psrav_d_256: { 10300 unsigned Opcode; 10301 switch (IntNo) { 10302 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
10303    case Intrinsic::x86_avx2_psllv_d:
10304    case Intrinsic::x86_avx2_psllv_q:
10305    case Intrinsic::x86_avx2_psllv_d_256:
10306    case Intrinsic::x86_avx2_psllv_q_256:
10307      Opcode = ISD::SHL;
10308      break;
10309    case Intrinsic::x86_avx2_psrlv_d:
10310    case Intrinsic::x86_avx2_psrlv_q:
10311    case Intrinsic::x86_avx2_psrlv_d_256:
10312    case Intrinsic::x86_avx2_psrlv_q_256:
10313      Opcode = ISD::SRL;
10314      break;
10315    case Intrinsic::x86_avx2_psrav_d:
10316    case Intrinsic::x86_avx2_psrav_d_256:
10317      Opcode = ISD::SRA;
10318      break;
10319    }
10320    return DAG.getNode(Opcode, dl, Op.getValueType(),
10321                       Op.getOperand(1), Op.getOperand(2));
10322  }
10323
10324  case Intrinsic::x86_ssse3_pshuf_b_128:
10325  case Intrinsic::x86_avx2_pshuf_b:
10326    return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
10327                       Op.getOperand(1), Op.getOperand(2));
10328
10329  case Intrinsic::x86_ssse3_psign_b_128:
10330  case Intrinsic::x86_ssse3_psign_w_128:
10331  case Intrinsic::x86_ssse3_psign_d_128:
10332  case Intrinsic::x86_avx2_psign_b:
10333  case Intrinsic::x86_avx2_psign_w:
10334  case Intrinsic::x86_avx2_psign_d:
10335    return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
10336                       Op.getOperand(1), Op.getOperand(2));
10337
10338  case Intrinsic::x86_sse41_insertps:
10339    return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
10340                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
10341
10342  case Intrinsic::x86_avx_vperm2f128_ps_256:
10343  case Intrinsic::x86_avx_vperm2f128_pd_256:
10344  case Intrinsic::x86_avx_vperm2f128_si_256:
10345  case Intrinsic::x86_avx2_vperm2i128:
10346    return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
10347                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
10348
10349  case Intrinsic::x86_avx2_permd:
10350  case Intrinsic::x86_avx2_permps:
10351    // Operands intentionally swapped. Mask is last operand to intrinsic,
10352    // but second operand for node/instruction.
10353    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
10354                       Op.getOperand(2), Op.getOperand(1));
10355
10356  // ptest and testp intrinsics. The intrinsics these come from are designed to
10357  // return an integer value, not just an instruction, so lower them to the ptest
10358  // or testp pattern and a setcc for the result.
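  // For instance, ptestz(a, b) checks ZF after a PTEST and is lowered as
  // roughly (zext i32 (x86_setcc COND_E, (x86_ptest a, b))).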
10359 case Intrinsic::x86_sse41_ptestz: 10360 case Intrinsic::x86_sse41_ptestc: 10361 case Intrinsic::x86_sse41_ptestnzc: 10362 case Intrinsic::x86_avx_ptestz_256: 10363 case Intrinsic::x86_avx_ptestc_256: 10364 case Intrinsic::x86_avx_ptestnzc_256: 10365 case Intrinsic::x86_avx_vtestz_ps: 10366 case Intrinsic::x86_avx_vtestc_ps: 10367 case Intrinsic::x86_avx_vtestnzc_ps: 10368 case Intrinsic::x86_avx_vtestz_pd: 10369 case Intrinsic::x86_avx_vtestc_pd: 10370 case Intrinsic::x86_avx_vtestnzc_pd: 10371 case Intrinsic::x86_avx_vtestz_ps_256: 10372 case Intrinsic::x86_avx_vtestc_ps_256: 10373 case Intrinsic::x86_avx_vtestnzc_ps_256: 10374 case Intrinsic::x86_avx_vtestz_pd_256: 10375 case Intrinsic::x86_avx_vtestc_pd_256: 10376 case Intrinsic::x86_avx_vtestnzc_pd_256: { 10377 bool IsTestPacked = false; 10378 unsigned X86CC; 10379 switch (IntNo) { 10380 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 10381 case Intrinsic::x86_avx_vtestz_ps: 10382 case Intrinsic::x86_avx_vtestz_pd: 10383 case Intrinsic::x86_avx_vtestz_ps_256: 10384 case Intrinsic::x86_avx_vtestz_pd_256: 10385 IsTestPacked = true; // Fallthrough 10386 case Intrinsic::x86_sse41_ptestz: 10387 case Intrinsic::x86_avx_ptestz_256: 10388 // ZF = 1 10389 X86CC = X86::COND_E; 10390 break; 10391 case Intrinsic::x86_avx_vtestc_ps: 10392 case Intrinsic::x86_avx_vtestc_pd: 10393 case Intrinsic::x86_avx_vtestc_ps_256: 10394 case Intrinsic::x86_avx_vtestc_pd_256: 10395 IsTestPacked = true; // Fallthrough 10396 case Intrinsic::x86_sse41_ptestc: 10397 case Intrinsic::x86_avx_ptestc_256: 10398 // CF = 1 10399 X86CC = X86::COND_B; 10400 break; 10401 case Intrinsic::x86_avx_vtestnzc_ps: 10402 case Intrinsic::x86_avx_vtestnzc_pd: 10403 case Intrinsic::x86_avx_vtestnzc_ps_256: 10404 case Intrinsic::x86_avx_vtestnzc_pd_256: 10405 IsTestPacked = true; // Fallthrough 10406 case Intrinsic::x86_sse41_ptestnzc: 10407 case Intrinsic::x86_avx_ptestnzc_256: 10408 // ZF and CF = 0 10409 X86CC = X86::COND_A; 10410 break; 10411 } 10412 10413 SDValue LHS = Op.getOperand(1); 10414 SDValue RHS = Op.getOperand(2); 10415 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 10416 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 10417 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 10418 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 10419 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 10420 } 10421 10422 // SSE/AVX shift intrinsics 10423 case Intrinsic::x86_sse2_psll_w: 10424 case Intrinsic::x86_sse2_psll_d: 10425 case Intrinsic::x86_sse2_psll_q: 10426 case Intrinsic::x86_avx2_psll_w: 10427 case Intrinsic::x86_avx2_psll_d: 10428 case Intrinsic::x86_avx2_psll_q: 10429 case Intrinsic::x86_sse2_psrl_w: 10430 case Intrinsic::x86_sse2_psrl_d: 10431 case Intrinsic::x86_sse2_psrl_q: 10432 case Intrinsic::x86_avx2_psrl_w: 10433 case Intrinsic::x86_avx2_psrl_d: 10434 case Intrinsic::x86_avx2_psrl_q: 10435 case Intrinsic::x86_sse2_psra_w: 10436 case Intrinsic::x86_sse2_psra_d: 10437 case Intrinsic::x86_avx2_psra_w: 10438 case Intrinsic::x86_avx2_psra_d: { 10439 unsigned Opcode; 10440 switch (IntNo) { 10441 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
10442 case Intrinsic::x86_sse2_psll_w: 10443 case Intrinsic::x86_sse2_psll_d: 10444 case Intrinsic::x86_sse2_psll_q: 10445 case Intrinsic::x86_avx2_psll_w: 10446 case Intrinsic::x86_avx2_psll_d: 10447 case Intrinsic::x86_avx2_psll_q: 10448 Opcode = X86ISD::VSHL; 10449 break; 10450 case Intrinsic::x86_sse2_psrl_w: 10451 case Intrinsic::x86_sse2_psrl_d: 10452 case Intrinsic::x86_sse2_psrl_q: 10453 case Intrinsic::x86_avx2_psrl_w: 10454 case Intrinsic::x86_avx2_psrl_d: 10455 case Intrinsic::x86_avx2_psrl_q: 10456 Opcode = X86ISD::VSRL; 10457 break; 10458 case Intrinsic::x86_sse2_psra_w: 10459 case Intrinsic::x86_sse2_psra_d: 10460 case Intrinsic::x86_avx2_psra_w: 10461 case Intrinsic::x86_avx2_psra_d: 10462 Opcode = X86ISD::VSRA; 10463 break; 10464 } 10465 return DAG.getNode(Opcode, dl, Op.getValueType(), 10466 Op.getOperand(1), Op.getOperand(2)); 10467 } 10468 10469 // SSE/AVX immediate shift intrinsics 10470 case Intrinsic::x86_sse2_pslli_w: 10471 case Intrinsic::x86_sse2_pslli_d: 10472 case Intrinsic::x86_sse2_pslli_q: 10473 case Intrinsic::x86_avx2_pslli_w: 10474 case Intrinsic::x86_avx2_pslli_d: 10475 case Intrinsic::x86_avx2_pslli_q: 10476 case Intrinsic::x86_sse2_psrli_w: 10477 case Intrinsic::x86_sse2_psrli_d: 10478 case Intrinsic::x86_sse2_psrli_q: 10479 case Intrinsic::x86_avx2_psrli_w: 10480 case Intrinsic::x86_avx2_psrli_d: 10481 case Intrinsic::x86_avx2_psrli_q: 10482 case Intrinsic::x86_sse2_psrai_w: 10483 case Intrinsic::x86_sse2_psrai_d: 10484 case Intrinsic::x86_avx2_psrai_w: 10485 case Intrinsic::x86_avx2_psrai_d: { 10486 unsigned Opcode; 10487 switch (IntNo) { 10488 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 10489 case Intrinsic::x86_sse2_pslli_w: 10490 case Intrinsic::x86_sse2_pslli_d: 10491 case Intrinsic::x86_sse2_pslli_q: 10492 case Intrinsic::x86_avx2_pslli_w: 10493 case Intrinsic::x86_avx2_pslli_d: 10494 case Intrinsic::x86_avx2_pslli_q: 10495 Opcode = X86ISD::VSHLI; 10496 break; 10497 case Intrinsic::x86_sse2_psrli_w: 10498 case Intrinsic::x86_sse2_psrli_d: 10499 case Intrinsic::x86_sse2_psrli_q: 10500 case Intrinsic::x86_avx2_psrli_w: 10501 case Intrinsic::x86_avx2_psrli_d: 10502 case Intrinsic::x86_avx2_psrli_q: 10503 Opcode = X86ISD::VSRLI; 10504 break; 10505 case Intrinsic::x86_sse2_psrai_w: 10506 case Intrinsic::x86_sse2_psrai_d: 10507 case Intrinsic::x86_avx2_psrai_w: 10508 case Intrinsic::x86_avx2_psrai_d: 10509 Opcode = X86ISD::VSRAI; 10510 break; 10511 } 10512 return getTargetVShiftNode(Opcode, dl, Op.getValueType(), 10513 Op.getOperand(1), Op.getOperand(2), DAG); 10514 } 10515 10516 case Intrinsic::x86_sse42_pcmpistria128: 10517 case Intrinsic::x86_sse42_pcmpestria128: 10518 case Intrinsic::x86_sse42_pcmpistric128: 10519 case Intrinsic::x86_sse42_pcmpestric128: 10520 case Intrinsic::x86_sse42_pcmpistrio128: 10521 case Intrinsic::x86_sse42_pcmpestrio128: 10522 case Intrinsic::x86_sse42_pcmpistris128: 10523 case Intrinsic::x86_sse42_pcmpestris128: 10524 case Intrinsic::x86_sse42_pcmpistriz128: 10525 case Intrinsic::x86_sse42_pcmpestriz128: { 10526 unsigned Opcode; 10527 unsigned X86CC; 10528 switch (IntNo) { 10529 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
10530 case Intrinsic::x86_sse42_pcmpistria128: 10531 Opcode = X86ISD::PCMPISTRI; 10532 X86CC = X86::COND_A; 10533 break; 10534 case Intrinsic::x86_sse42_pcmpestria128: 10535 Opcode = X86ISD::PCMPESTRI; 10536 X86CC = X86::COND_A; 10537 break; 10538 case Intrinsic::x86_sse42_pcmpistric128: 10539 Opcode = X86ISD::PCMPISTRI; 10540 X86CC = X86::COND_B; 10541 break; 10542 case Intrinsic::x86_sse42_pcmpestric128: 10543 Opcode = X86ISD::PCMPESTRI; 10544 X86CC = X86::COND_B; 10545 break; 10546 case Intrinsic::x86_sse42_pcmpistrio128: 10547 Opcode = X86ISD::PCMPISTRI; 10548 X86CC = X86::COND_O; 10549 break; 10550 case Intrinsic::x86_sse42_pcmpestrio128: 10551 Opcode = X86ISD::PCMPESTRI; 10552 X86CC = X86::COND_O; 10553 break; 10554 case Intrinsic::x86_sse42_pcmpistris128: 10555 Opcode = X86ISD::PCMPISTRI; 10556 X86CC = X86::COND_S; 10557 break; 10558 case Intrinsic::x86_sse42_pcmpestris128: 10559 Opcode = X86ISD::PCMPESTRI; 10560 X86CC = X86::COND_S; 10561 break; 10562 case Intrinsic::x86_sse42_pcmpistriz128: 10563 Opcode = X86ISD::PCMPISTRI; 10564 X86CC = X86::COND_E; 10565 break; 10566 case Intrinsic::x86_sse42_pcmpestriz128: 10567 Opcode = X86ISD::PCMPESTRI; 10568 X86CC = X86::COND_E; 10569 break; 10570 } 10571 SmallVector<SDValue, 5> NewOps; 10572 NewOps.append(Op->op_begin()+1, Op->op_end()); 10573 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 10574 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); 10575 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 10576 DAG.getConstant(X86CC, MVT::i8), 10577 SDValue(PCMP.getNode(), 1)); 10578 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 10579 } 10580 10581 case Intrinsic::x86_sse42_pcmpistri128: 10582 case Intrinsic::x86_sse42_pcmpestri128: { 10583 unsigned Opcode; 10584 if (IntNo == Intrinsic::x86_sse42_pcmpistri128) 10585 Opcode = X86ISD::PCMPISTRI; 10586 else 10587 Opcode = X86ISD::PCMPESTRI; 10588 10589 SmallVector<SDValue, 5> NewOps; 10590 NewOps.append(Op->op_begin()+1, Op->op_end()); 10591 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 10592 return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); 10593 } 10594 case Intrinsic::x86_fma_vfmadd_ps: 10595 case Intrinsic::x86_fma_vfmadd_pd: 10596 case Intrinsic::x86_fma_vfmsub_ps: 10597 case Intrinsic::x86_fma_vfmsub_pd: 10598 case Intrinsic::x86_fma_vfnmadd_ps: 10599 case Intrinsic::x86_fma_vfnmadd_pd: 10600 case Intrinsic::x86_fma_vfnmsub_ps: 10601 case Intrinsic::x86_fma_vfnmsub_pd: 10602 case Intrinsic::x86_fma_vfmaddsub_ps: 10603 case Intrinsic::x86_fma_vfmaddsub_pd: 10604 case Intrinsic::x86_fma_vfmsubadd_ps: 10605 case Intrinsic::x86_fma_vfmsubadd_pd: 10606 case Intrinsic::x86_fma_vfmadd_ps_256: 10607 case Intrinsic::x86_fma_vfmadd_pd_256: 10608 case Intrinsic::x86_fma_vfmsub_ps_256: 10609 case Intrinsic::x86_fma_vfmsub_pd_256: 10610 case Intrinsic::x86_fma_vfnmadd_ps_256: 10611 case Intrinsic::x86_fma_vfnmadd_pd_256: 10612 case Intrinsic::x86_fma_vfnmsub_ps_256: 10613 case Intrinsic::x86_fma_vfnmsub_pd_256: 10614 case Intrinsic::x86_fma_vfmaddsub_ps_256: 10615 case Intrinsic::x86_fma_vfmaddsub_pd_256: 10616 case Intrinsic::x86_fma_vfmsubadd_ps_256: 10617 case Intrinsic::x86_fma_vfmsubadd_pd_256: { 10618 unsigned Opc; 10619 switch (IntNo) { 10620 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
10621 case Intrinsic::x86_fma_vfmadd_ps: 10622 case Intrinsic::x86_fma_vfmadd_pd: 10623 case Intrinsic::x86_fma_vfmadd_ps_256: 10624 case Intrinsic::x86_fma_vfmadd_pd_256: 10625 Opc = X86ISD::FMADD; 10626 break; 10627 case Intrinsic::x86_fma_vfmsub_ps: 10628 case Intrinsic::x86_fma_vfmsub_pd: 10629 case Intrinsic::x86_fma_vfmsub_ps_256: 10630 case Intrinsic::x86_fma_vfmsub_pd_256: 10631 Opc = X86ISD::FMSUB; 10632 break; 10633 case Intrinsic::x86_fma_vfnmadd_ps: 10634 case Intrinsic::x86_fma_vfnmadd_pd: 10635 case Intrinsic::x86_fma_vfnmadd_ps_256: 10636 case Intrinsic::x86_fma_vfnmadd_pd_256: 10637 Opc = X86ISD::FNMADD; 10638 break; 10639 case Intrinsic::x86_fma_vfnmsub_ps: 10640 case Intrinsic::x86_fma_vfnmsub_pd: 10641 case Intrinsic::x86_fma_vfnmsub_ps_256: 10642 case Intrinsic::x86_fma_vfnmsub_pd_256: 10643 Opc = X86ISD::FNMSUB; 10644 break; 10645 case Intrinsic::x86_fma_vfmaddsub_ps: 10646 case Intrinsic::x86_fma_vfmaddsub_pd: 10647 case Intrinsic::x86_fma_vfmaddsub_ps_256: 10648 case Intrinsic::x86_fma_vfmaddsub_pd_256: 10649 Opc = X86ISD::FMADDSUB; 10650 break; 10651 case Intrinsic::x86_fma_vfmsubadd_ps: 10652 case Intrinsic::x86_fma_vfmsubadd_pd: 10653 case Intrinsic::x86_fma_vfmsubadd_ps_256: 10654 case Intrinsic::x86_fma_vfmsubadd_pd_256: 10655 Opc = X86ISD::FMSUBADD; 10656 break; 10657 } 10658 10659 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), 10660 Op.getOperand(2), Op.getOperand(3)); 10661 } 10662 } 10663} 10664 10665static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { 10666 DebugLoc dl = Op.getDebugLoc(); 10667 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10668 switch (IntNo) { 10669 default: return SDValue(); // Don't custom lower most intrinsics. 10670 10671 // RDRAND intrinsics. 10672 case Intrinsic::x86_rdrand_16: 10673 case Intrinsic::x86_rdrand_32: 10674 case Intrinsic::x86_rdrand_64: { 10675 // Emit the node with the right value type. 10676 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); 10677 SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0)); 10678 10679 // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise 10680 // return the value from Rand, which is always 0, casted to i32. 10681 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), 10682 DAG.getConstant(1, Op->getValueType(1)), 10683 DAG.getConstant(X86::COND_B, MVT::i32), 10684 SDValue(Result.getNode(), 1) }; 10685 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, 10686 DAG.getVTList(Op->getValueType(1), MVT::Glue), 10687 Ops, 4); 10688 10689 // Return { result, isValid, chain }. 10690 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, 10691 SDValue(Result.getNode(), 2)); 10692 } 10693 } 10694} 10695 10696SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 10697 SelectionDAG &DAG) const { 10698 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 10699 MFI->setReturnAddressIsTaken(true); 10700 10701 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 10702 DebugLoc dl = Op.getDebugLoc(); 10703 EVT PtrVT = getPointerTy(); 10704 10705 if (Depth > 0) { 10706 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 10707 SDValue Offset = 10708 DAG.getConstant(RegInfo->getSlotSize(), PtrVT); 10709 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 10710 DAG.getNode(ISD::ADD, dl, PtrVT, 10711 FrameAddr, Offset), 10712 MachinePointerInfo(), false, false, false, 0); 10713 } 10714 10715 // Just load the return address. 
10716 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 10717 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 10718 RetAddrFI, MachinePointerInfo(), false, false, false, 0); 10719} 10720 10721SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 10722 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 10723 MFI->setFrameAddressIsTaken(true); 10724 10725 EVT VT = Op.getValueType(); 10726 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 10727 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 10728 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 10729 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 10730 while (Depth--) 10731 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 10732 MachinePointerInfo(), 10733 false, false, false, 0); 10734 return FrameAddr; 10735} 10736 10737SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 10738 SelectionDAG &DAG) const { 10739 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); 10740} 10741 10742SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 10743 SDValue Chain = Op.getOperand(0); 10744 SDValue Offset = Op.getOperand(1); 10745 SDValue Handler = Op.getOperand(2); 10746 DebugLoc dl = Op.getDebugLoc(); 10747 10748 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 10749 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 10750 getPointerTy()); 10751 unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); 10752 10753 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 10754 DAG.getIntPtrConstant(RegInfo->getSlotSize())); 10755 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 10756 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 10757 false, false, 0); 10758 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 10759 10760 return DAG.getNode(X86ISD::EH_RETURN, dl, 10761 MVT::Other, 10762 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 10763} 10764 10765SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, 10766 SelectionDAG &DAG) const { 10767 DebugLoc DL = Op.getDebugLoc(); 10768 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, 10769 DAG.getVTList(MVT::i32, MVT::Other), 10770 Op.getOperand(0), Op.getOperand(1)); 10771} 10772 10773SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, 10774 SelectionDAG &DAG) const { 10775 DebugLoc DL = Op.getDebugLoc(); 10776 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, 10777 Op.getOperand(0), Op.getOperand(1)); 10778} 10779 10780static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { 10781 return Op.getOperand(0); 10782} 10783 10784SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 10785 SelectionDAG &DAG) const { 10786 SDValue Root = Op.getOperand(0); 10787 SDValue Trmp = Op.getOperand(1); // trampoline 10788 SDValue FPtr = Op.getOperand(2); // nested function 10789 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 10790 DebugLoc dl = Op.getDebugLoc(); 10791 10792 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 10793 const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); 10794 10795 if (Subtarget->is64Bit()) { 10796 SDValue OutChains[6]; 10797 10798 // Large code-model. 10799 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 10800 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 
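// Taken together, the six stores below materialize this 23-byte sequence at
// Trmp (byte offsets on the left; the i16 opcode constants are stored
// little-endian, so the REX byte lands first, and REX.W on the jmp is
// redundant but harmless):
//    0: 49 BB <FPtr, 8 bytes>    movabsq $FPtr, %r11
//   10: 49 BA <Nest, 8 bytes>    movabsq $Nest, %r10
//   20: 49 FF E3                 jmpq   *%r11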
10801 10802 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; 10803 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; 10804 10805 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 10806 10807 // Load the pointer to the nested function into R11. 10808 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 10809 SDValue Addr = Trmp; 10810 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 10811 Addr, MachinePointerInfo(TrmpAddr), 10812 false, false, 0); 10813 10814 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10815 DAG.getConstant(2, MVT::i64)); 10816 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 10817 MachinePointerInfo(TrmpAddr, 2), 10818 false, false, 2); 10819 10820 // Load the 'nest' parameter value into R10. 10821 // R10 is specified in X86CallingConv.td 10822 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 10823 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10824 DAG.getConstant(10, MVT::i64)); 10825 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 10826 Addr, MachinePointerInfo(TrmpAddr, 10), 10827 false, false, 0); 10828 10829 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10830 DAG.getConstant(12, MVT::i64)); 10831 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 10832 MachinePointerInfo(TrmpAddr, 12), 10833 false, false, 2); 10834 10835 // Jump to the nested function. 10836 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 10837 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10838 DAG.getConstant(20, MVT::i64)); 10839 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 10840 Addr, MachinePointerInfo(TrmpAddr, 20), 10841 false, false, 0); 10842 10843 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 10844 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10845 DAG.getConstant(22, MVT::i64)); 10846 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 10847 MachinePointerInfo(TrmpAddr, 22), 10848 false, false, 0); 10849 10850 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); 10851 } else { 10852 const Function *Func = 10853 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 10854 CallingConv::ID CC = Func->getCallingConv(); 10855 unsigned NestReg; 10856 10857 switch (CC) { 10858 default: 10859 llvm_unreachable("Unsupported calling convention"); 10860 case CallingConv::C: 10861 case CallingConv::X86_StdCall: { 10862 // Pass 'nest' parameter in ECX. 10863 // Must be kept in sync with X86CallingConv.td 10864 NestReg = X86::ECX; 10865 10866 // Check that ECX wasn't needed by an 'inreg' parameter. 10867 FunctionType *FTy = Func->getFunctionType(); 10868 const AttributeSet &Attrs = Func->getAttributes(); 10869 10870 if (!Attrs.isEmpty() && !Func->isVarArg()) { 10871 unsigned InRegCount = 0; 10872 unsigned Idx = 1; 10873 10874 for (FunctionType::param_iterator I = FTy->param_begin(), 10875 E = FTy->param_end(); I != E; ++I, ++Idx) 10876 if (Attrs.getParamAttributes(Idx).hasAttribute(Attribute::InReg)) 10877 // FIXME: should only count parameters that are lowered to integers. 10878 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 10879 10880 if (InRegCount > 2) { 10881 report_fatal_error("Nest register in use - reduce number of inreg" 10882 " parameters!"); 10883 } 10884 } 10885 break; 10886 } 10887 case CallingConv::X86_FastCall: 10888 case CallingConv::X86_ThisCall: 10889 case CallingConv::Fast: 10890 // Pass 'nest' parameter in EAX. 
10891 // Must be kept in sync with X86CallingConv.td 10892 NestReg = X86::EAX; 10893 break; 10894 } 10895 10896 SDValue OutChains[4]; 10897 SDValue Addr, Disp; 10898 10899 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 10900 DAG.getConstant(10, MVT::i32)); 10901 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 10902 10903 // This is storing the opcode for MOV32ri. 10904 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 10905 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; 10906 OutChains[0] = DAG.getStore(Root, dl, 10907 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 10908 Trmp, MachinePointerInfo(TrmpAddr), 10909 false, false, 0); 10910 10911 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 10912 DAG.getConstant(1, MVT::i32)); 10913 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 10914 MachinePointerInfo(TrmpAddr, 1), 10915 false, false, 1); 10916 10917 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 10918 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 10919 DAG.getConstant(5, MVT::i32)); 10920 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 10921 MachinePointerInfo(TrmpAddr, 5), 10922 false, false, 1); 10923 10924 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 10925 DAG.getConstant(6, MVT::i32)); 10926 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 10927 MachinePointerInfo(TrmpAddr, 6), 10928 false, false, 1); 10929 10930 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); 10931 } 10932} 10933 10934SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 10935 SelectionDAG &DAG) const { 10936 /* 10937 The rounding mode is in bits 11:10 of FPSR, and has the following 10938 settings: 10939 00 Round to nearest 10940 01 Round to -inf 10941 10 Round to +inf 10942 11 Round to 0 10943 10944 FLT_ROUNDS, on the other hand, expects the following: 10945 -1 Undefined 10946 0 Round to 0 10947 1 Round to nearest 10948 2 Round to +inf 10949 3 Round to -inf 10950 10951 To perform the conversion, we do: 10952 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 10953 */ 10954 10955 MachineFunction &MF = DAG.getMachineFunction(); 10956 const TargetMachine &TM = MF.getTarget(); 10957 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 10958 unsigned StackAlignment = TFI.getStackAlignment(); 10959 EVT VT = Op.getValueType(); 10960 DebugLoc DL = Op.getDebugLoc(); 10961 10962 // Save FP Control Word to stack slot 10963 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 10964 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 10965 10966 MachineMemOperand *MMO = 10967 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 10968 MachineMemOperand::MOStore, 2, 2); 10969 10970 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 10971 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 10972 DAG.getVTList(MVT::Other), 10973 Ops, 2, MVT::i16, MMO); 10974 10975 // Load FP Control Word from stack slot 10976 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 10977 MachinePointerInfo(), false, false, false, 0); 10978 10979 // Transform as necessary 10980 SDValue CWD1 = 10981 DAG.getNode(ISD::SRL, DL, MVT::i16, 10982 DAG.getNode(ISD::AND, DL, MVT::i16, 10983 CWD, DAG.getConstant(0x800, MVT::i16)), 10984 DAG.getConstant(11, MVT::i8)); 10985 SDValue CWD2 = 10986 DAG.getNode(ISD::SRL, DL, MVT::i16, 10987 DAG.getNode(ISD::AND, DL, MVT::i16, 10988 CWD, DAG.getConstant(0x400, MVT::i16)), 10989 DAG.getConstant(9, MVT::i8)); 10990 10991 SDValue RetVal = 
10992 DAG.getNode(ISD::AND, DL, MVT::i16, 10993 DAG.getNode(ISD::ADD, DL, MVT::i16, 10994 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 10995 DAG.getConstant(1, MVT::i16)), 10996 DAG.getConstant(3, MVT::i16)); 10997 10998 return DAG.getNode((VT.getSizeInBits() < 16 ? 10999 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 11000} 11001 11002static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 11003 EVT VT = Op.getValueType(); 11004 EVT OpVT = VT; 11005 unsigned NumBits = VT.getSizeInBits(); 11006 DebugLoc dl = Op.getDebugLoc(); 11007 11008 Op = Op.getOperand(0); 11009 if (VT == MVT::i8) { 11010 // Zero extend to i32 since there is not an i8 bsr. 11011 OpVT = MVT::i32; 11012 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 11013 } 11014 11015 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 11016 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 11017 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 11018 11019 // If src is zero (i.e. bsr sets ZF), returns NumBits. 11020 SDValue Ops[] = { 11021 Op, 11022 DAG.getConstant(NumBits+NumBits-1, OpVT), 11023 DAG.getConstant(X86::COND_E, MVT::i8), 11024 Op.getValue(1) 11025 }; 11026 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 11027 11028 // Finally xor with NumBits-1. 11029 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 11030 11031 if (VT == MVT::i8) 11032 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 11033 return Op; 11034} 11035 11036static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { 11037 EVT VT = Op.getValueType(); 11038 EVT OpVT = VT; 11039 unsigned NumBits = VT.getSizeInBits(); 11040 DebugLoc dl = Op.getDebugLoc(); 11041 11042 Op = Op.getOperand(0); 11043 if (VT == MVT::i8) { 11044 // Zero extend to i32 since there is not an i8 bsr. 11045 OpVT = MVT::i32; 11046 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 11047 } 11048 11049 // Issue a bsr (scan bits in reverse). 11050 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 11051 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 11052 11053 // And xor with NumBits-1. 11054 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 11055 11056 if (VT == MVT::i8) 11057 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 11058 return Op; 11059} 11060 11061static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 11062 EVT VT = Op.getValueType(); 11063 unsigned NumBits = VT.getSizeInBits(); 11064 DebugLoc dl = Op.getDebugLoc(); 11065 Op = Op.getOperand(0); 11066 11067 // Issue a bsf (scan bits forward) which also sets EFLAGS. 11068 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 11069 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 11070 11071 // If src is zero (i.e. bsf sets ZF), returns NumBits. 11072 SDValue Ops[] = { 11073 Op, 11074 DAG.getConstant(NumBits, VT), 11075 DAG.getConstant(X86::COND_E, MVT::i8), 11076 Op.getValue(1) 11077 }; 11078 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); 11079} 11080 11081// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit 11082// ones, and then concatenate the result back. 
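// For example, a v8i32 add becomes two v4i32 adds on the low and high 128-bit
// halves, and the two results are joined back together with CONCAT_VECTORS.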
11083static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 11084 EVT VT = Op.getValueType(); 11085 11086 assert(VT.is256BitVector() && VT.isInteger() && 11087 "Unsupported value type for operation"); 11088 11089 unsigned NumElems = VT.getVectorNumElements(); 11090 DebugLoc dl = Op.getDebugLoc(); 11091 11092 // Extract the LHS vectors 11093 SDValue LHS = Op.getOperand(0); 11094 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 11095 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 11096 11097 // Extract the RHS vectors 11098 SDValue RHS = Op.getOperand(1); 11099 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 11100 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 11101 11102 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 11103 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 11104 11105 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 11106 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 11107 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 11108} 11109 11110static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { 11111 assert(Op.getValueType().is256BitVector() && 11112 Op.getValueType().isInteger() && 11113 "Only handle AVX 256-bit vector integer operation"); 11114 return Lower256IntArith(Op, DAG); 11115} 11116 11117static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { 11118 assert(Op.getValueType().is256BitVector() && 11119 Op.getValueType().isInteger() && 11120 "Only handle AVX 256-bit vector integer operation"); 11121 return Lower256IntArith(Op, DAG); 11122} 11123 11124static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, 11125 SelectionDAG &DAG) { 11126 DebugLoc dl = Op.getDebugLoc(); 11127 EVT VT = Op.getValueType(); 11128 11129 // Decompose 256-bit ops into smaller 128-bit ops. 11130 if (VT.is256BitVector() && !Subtarget->hasInt256()) 11131 return Lower256IntArith(Op, DAG); 11132 11133 SDValue A = Op.getOperand(0); 11134 SDValue B = Op.getOperand(1); 11135 11136 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. 11137 if (VT == MVT::v4i32) { 11138 assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && 11139 "Should not custom lower when pmuldq is available!"); 11140 11141 // Extract the odd parts. 11142 const int UnpackMask[] = { 1, -1, 3, -1 }; 11143 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); 11144 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); 11145 11146 // Multiply the even parts. 11147 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); 11148 // Now multiply odd parts. 11149 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); 11150 11151 Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens); 11152 Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds); 11153 11154 // Merge the two vectors back together with a shuffle. This expands into 2 11155 // shuffles. 
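// After the bitcasts, Evens is {lo(A0*B0), hi, lo(A2*B2), hi} and Odds is
// {lo(A1*B1), hi, lo(A3*B3), hi}; the <0,4,2,6> mask below picks out the four
// low halves in lane order, which is exactly the truncated v4i32 product.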
11156 const int ShufMask[] = { 0, 4, 2, 6 }; 11157 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); 11158 } 11159 11160 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && 11161 "Only know how to lower V2I64/V4I64 multiply"); 11162 11163 // Ahi = psrlqi(a, 32); 11164 // Bhi = psrlqi(b, 32); 11165 // 11166 // AloBlo = pmuludq(a, b); 11167 // AloBhi = pmuludq(a, Bhi); 11168 // AhiBlo = pmuludq(Ahi, b); 11169 11170 // AloBhi = psllqi(AloBhi, 32); 11171 // AhiBlo = psllqi(AhiBlo, 32); 11172 // return AloBlo + AloBhi + AhiBlo; 11173 11174 SDValue ShAmt = DAG.getConstant(32, MVT::i32); 11175 11176 SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); 11177 SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); 11178 11179 // Bit cast to 32-bit vectors for MULUDQ 11180 EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32; 11181 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); 11182 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); 11183 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); 11184 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); 11185 11186 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); 11187 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); 11188 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); 11189 11190 AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); 11191 AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); 11192 11193 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 11194 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 11195} 11196 11197SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 11198 11199 EVT VT = Op.getValueType(); 11200 DebugLoc dl = Op.getDebugLoc(); 11201 SDValue R = Op.getOperand(0); 11202 SDValue Amt = Op.getOperand(1); 11203 LLVMContext *Context = DAG.getContext(); 11204 11205 if (!Subtarget->hasSSE2()) 11206 return SDValue(); 11207 11208 // Optimize shl/srl/sra with constant shift amount. 11209 if (isSplatVector(Amt.getNode())) { 11210 SDValue SclrAmt = Amt->getOperand(0); 11211 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 11212 uint64_t ShiftAmt = C->getZExtValue(); 11213 11214 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 11215 (Subtarget->hasInt256() && 11216 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { 11217 if (Op.getOpcode() == ISD::SHL) 11218 return DAG.getNode(X86ISD::VSHLI, dl, VT, R, 11219 DAG.getConstant(ShiftAmt, MVT::i32)); 11220 if (Op.getOpcode() == ISD::SRL) 11221 return DAG.getNode(X86ISD::VSRLI, dl, VT, R, 11222 DAG.getConstant(ShiftAmt, MVT::i32)); 11223 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) 11224 return DAG.getNode(X86ISD::VSRAI, dl, VT, R, 11225 DAG.getConstant(ShiftAmt, MVT::i32)); 11226 } 11227 11228 if (VT == MVT::v16i8) { 11229 if (Op.getOpcode() == ISD::SHL) { 11230 // Make a large shift. 11231 SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R, 11232 DAG.getConstant(ShiftAmt, MVT::i32)); 11233 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 11234 // Zero out the rightmost bits. 11235 SmallVector<SDValue, 16> V(16, 11236 DAG.getConstant(uint8_t(-1U << ShiftAmt), 11237 MVT::i8)); 11238 return DAG.getNode(ISD::AND, dl, VT, SHL, 11239 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 11240 } 11241 if (Op.getOpcode() == ISD::SRL) { 11242 // Make a large shift. 
11243 SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R, 11244 DAG.getConstant(ShiftAmt, MVT::i32)); 11245 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 11246 // Zero out the leftmost bits. 11247 SmallVector<SDValue, 16> V(16, 11248 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 11249 MVT::i8)); 11250 return DAG.getNode(ISD::AND, dl, VT, SRL, 11251 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 11252 } 11253 if (Op.getOpcode() == ISD::SRA) { 11254 if (ShiftAmt == 7) { 11255 // R s>> 7 === R s< 0 11256 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 11257 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 11258 } 11259 11260 // R s>> a === ((R u>> a) ^ m) - m 11261 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 11262 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, 11263 MVT::i8)); 11264 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); 11265 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 11266 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 11267 return Res; 11268 } 11269 llvm_unreachable("Unknown shift opcode."); 11270 } 11271 11272 if (Subtarget->hasInt256() && VT == MVT::v32i8) { 11273 if (Op.getOpcode() == ISD::SHL) { 11274 // Make a large shift. 11275 SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, 11276 DAG.getConstant(ShiftAmt, MVT::i32)); 11277 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 11278 // Zero out the rightmost bits. 11279 SmallVector<SDValue, 32> V(32, 11280 DAG.getConstant(uint8_t(-1U << ShiftAmt), 11281 MVT::i8)); 11282 return DAG.getNode(ISD::AND, dl, VT, SHL, 11283 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 11284 } 11285 if (Op.getOpcode() == ISD::SRL) { 11286 // Make a large shift. 11287 SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R, 11288 DAG.getConstant(ShiftAmt, MVT::i32)); 11289 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 11290 // Zero out the leftmost bits. 11291 SmallVector<SDValue, 32> V(32, 11292 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 11293 MVT::i8)); 11294 return DAG.getNode(ISD::AND, dl, VT, SRL, 11295 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 11296 } 11297 if (Op.getOpcode() == ISD::SRA) { 11298 if (ShiftAmt == 7) { 11299 // R s>> 7 === R s< 0 11300 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 11301 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 11302 } 11303 11304 // R s>> a === ((R u>> a) ^ m) - m 11305 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 11306 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, 11307 MVT::i8)); 11308 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); 11309 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 11310 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 11311 return Res; 11312 } 11313 llvm_unreachable("Unknown shift opcode."); 11314 } 11315 } 11316 } 11317 11318 // Lower SHL with variable shift amount. 
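// For v4i32 this builds 2^amt per lane entirely with integer ops: the amount
// is shifted into the f32 exponent field (amt << 23), the bit pattern of 1.0f
// (0x3f800000) is added, and converting that back to integer yields 2^amt, so
// the variable shift becomes a multiply (x << amt == x * 2^amt).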
11319 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 11320 Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1), 11321 DAG.getConstant(23, MVT::i32)); 11322 11323 const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U}; 11324 Constant *C = ConstantDataVector::get(*Context, CV); 11325 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 11326 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 11327 MachinePointerInfo::getConstantPool(), 11328 false, false, false, 16); 11329 11330 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 11331 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 11332 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 11333 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 11334 } 11335 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 11336 assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); 11337 11338 // a = a << 5; 11339 Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1), 11340 DAG.getConstant(5, MVT::i32)); 11341 Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); 11342 11343 // Turn 'a' into a mask suitable for VSELECT 11344 SDValue VSelM = DAG.getConstant(0x80, VT); 11345 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 11346 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 11347 11348 SDValue CM1 = DAG.getConstant(0x0f, VT); 11349 SDValue CM2 = DAG.getConstant(0x3f, VT); 11350 11351 // r = VSELECT(r, psllw(r & (char16)15, 4), a); 11352 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); 11353 M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 11354 DAG.getConstant(4, MVT::i32), DAG); 11355 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 11356 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 11357 11358 // a += a 11359 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 11360 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 11361 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 11362 11363 // r = VSELECT(r, psllw(r & (char16)63, 2), a); 11364 M = DAG.getNode(ISD::AND, dl, VT, R, CM2); 11365 M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 11366 DAG.getConstant(2, MVT::i32), DAG); 11367 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 11368 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 11369 11370 // a += a 11371 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 11372 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 11373 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 11374 11375 // return VSELECT(r, r+r, a); 11376 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, 11377 DAG.getNode(ISD::ADD, dl, VT, R, R), R); 11378 return R; 11379 } 11380 11381 // Decompose 256-bit shifts into smaller 128-bit shifts. 
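// Both the value and the shift amount are split into 128-bit halves (a
// constant BUILD_VECTOR amount is split element-wise so each half keeps
// constant amounts), the halves are shifted independently, and the results
// are concatenated back into the original 256-bit type.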
11382 if (VT.is256BitVector()) { 11383 unsigned NumElems = VT.getVectorNumElements(); 11384 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 11385 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 11386 11387 // Extract the two vectors 11388 SDValue V1 = Extract128BitVector(R, 0, DAG, dl); 11389 SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); 11390 11391 // Recreate the shift amount vectors 11392 SDValue Amt1, Amt2; 11393 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 11394 // Constant shift amount 11395 SmallVector<SDValue, 4> Amt1Csts; 11396 SmallVector<SDValue, 4> Amt2Csts; 11397 for (unsigned i = 0; i != NumElems/2; ++i) 11398 Amt1Csts.push_back(Amt->getOperand(i)); 11399 for (unsigned i = NumElems/2; i != NumElems; ++i) 11400 Amt2Csts.push_back(Amt->getOperand(i)); 11401 11402 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 11403 &Amt1Csts[0], NumElems/2); 11404 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 11405 &Amt2Csts[0], NumElems/2); 11406 } else { 11407 // Variable shift amount 11408 Amt1 = Extract128BitVector(Amt, 0, DAG, dl); 11409 Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); 11410 } 11411 11412 // Issue new vector shifts for the smaller types 11413 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); 11414 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); 11415 11416 // Concatenate the result back 11417 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); 11418 } 11419 11420 return SDValue(); 11421} 11422 11423static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 11424 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 11425 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 11426 // looks for this combo and may remove the "setcc" instruction if the "setcc" 11427 // has only one use. 11428 SDNode *N = Op.getNode(); 11429 SDValue LHS = N->getOperand(0); 11430 SDValue RHS = N->getOperand(1); 11431 unsigned BaseOp = 0; 11432 unsigned Cond = 0; 11433 DebugLoc DL = Op.getDebugLoc(); 11434 switch (Op.getOpcode()) { 11435 default: llvm_unreachable("Unknown ovf instruction!"); 11436 case ISD::SADDO: 11437 // An add of one will be selected as an INC. Note that INC doesn't 11438 // set CF, so we can't do this for UADDO. 11439 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 11440 if (C->isOne()) { 11441 BaseOp = X86ISD::INC; 11442 Cond = X86::COND_O; 11443 break; 11444 } 11445 BaseOp = X86ISD::ADD; 11446 Cond = X86::COND_O; 11447 break; 11448 case ISD::UADDO: 11449 BaseOp = X86ISD::ADD; 11450 Cond = X86::COND_B; 11451 break; 11452 case ISD::SSUBO: 11453 // A subtract of one will be selected as a DEC. Note that DEC doesn't 11454 // set CF, so we can't do this for USUBO.
11455 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 11456 if (C->isOne()) { 11457 BaseOp = X86ISD::DEC; 11458 Cond = X86::COND_O; 11459 break; 11460 } 11461 BaseOp = X86ISD::SUB; 11462 Cond = X86::COND_O; 11463 break; 11464 case ISD::USUBO: 11465 BaseOp = X86ISD::SUB; 11466 Cond = X86::COND_B; 11467 break; 11468 case ISD::SMULO: 11469 BaseOp = X86ISD::SMUL; 11470 Cond = X86::COND_O; 11471 break; 11472 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 11473 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 11474 MVT::i32); 11475 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 11476 11477 SDValue SetCC = 11478 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 11479 DAG.getConstant(X86::COND_O, MVT::i32), 11480 SDValue(Sum.getNode(), 2)); 11481 11482 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 11483 } 11484 } 11485 11486 // Also sets EFLAGS. 11487 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 11488 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 11489 11490 SDValue SetCC = 11491 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 11492 DAG.getConstant(Cond, MVT::i32), 11493 SDValue(Sum.getNode(), 1)); 11494 11495 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 11496} 11497 11498SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 11499 SelectionDAG &DAG) const { 11500 DebugLoc dl = Op.getDebugLoc(); 11501 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 11502 EVT VT = Op.getValueType(); 11503 11504 if (!Subtarget->hasSSE2() || !VT.isVector()) 11505 return SDValue(); 11506 11507 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 11508 ExtraVT.getScalarType().getSizeInBits(); 11509 SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); 11510 11511 switch (VT.getSimpleVT().SimpleTy) { 11512 default: return SDValue(); 11513 case MVT::v8i32: 11514 case MVT::v16i16: 11515 if (!Subtarget->hasFp256()) 11516 return SDValue(); 11517 if (!Subtarget->hasInt256()) { 11518 // needs to be split 11519 unsigned NumElems = VT.getVectorNumElements(); 11520 11521 // Extract the LHS vectors 11522 SDValue LHS = Op.getOperand(0); 11523 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 11524 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 11525 11526 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 11527 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 11528 11529 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 11530 unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); 11531 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, 11532 ExtraNumElems/2); 11533 SDValue Extra = DAG.getValueType(ExtraVT); 11534 11535 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); 11536 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); 11537 11538 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); 11539 } 11540 // fall through 11541 case MVT::v4i32: 11542 case MVT::v8i16: { 11543 SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, 11544 Op.getOperand(0), ShAmt, DAG); 11545 return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG); 11546 } 11547 } 11548} 11549 11550static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget, 11551 SelectionDAG &DAG) { 11552 DebugLoc dl = Op.getDebugLoc(); 11553 11554 // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. 11555 // There isn't any reason to disable it if the target processor supports it. 
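// When mfence is unavailable (32-bit, pre-SSE2), the block below instead
// emits a locked no-op read-modify-write of the top of the stack, in effect
// lock orl $0, (%esp), which acts as a full memory barrier on every x86.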
11556 if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { 11557 SDValue Chain = Op.getOperand(0); 11558 SDValue Zero = DAG.getConstant(0, MVT::i32); 11559 SDValue Ops[] = { 11560 DAG.getRegister(X86::ESP, MVT::i32), // Base 11561 DAG.getTargetConstant(1, MVT::i8), // Scale 11562 DAG.getRegister(0, MVT::i32), // Index 11563 DAG.getTargetConstant(0, MVT::i32), // Disp 11564 DAG.getRegister(0, MVT::i32), // Segment. 11565 Zero, 11566 Chain 11567 }; 11568 SDNode *Res = 11569 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 11570 array_lengthof(Ops)); 11571 return SDValue(Res, 0); 11572 } 11573 11574 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 11575 if (!isDev) 11576 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 11577 11578 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 11579 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 11580 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 11581 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 11582 11583 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 11584 if (!Op1 && !Op2 && !Op3 && Op4) 11585 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 11586 11587 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 11588 if (Op1 && !Op2 && !Op3 && !Op4) 11589 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 11590 11591 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 11592 // (MFENCE)>; 11593 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 11594} 11595 11596static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, 11597 SelectionDAG &DAG) { 11598 DebugLoc dl = Op.getDebugLoc(); 11599 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 11600 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 11601 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 11602 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 11603 11604 // The only fence that needs an instruction is a sequentially-consistent 11605 // cross-thread fence. 11606 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 11607 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 11608 // no-sse2). There isn't any reason to disable it if the target processor 11609 // supports it. 11610 if (Subtarget->hasSSE2() || Subtarget->is64Bit()) 11611 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 11612 11613 SDValue Chain = Op.getOperand(0); 11614 SDValue Zero = DAG.getConstant(0, MVT::i32); 11615 SDValue Ops[] = { 11616 DAG.getRegister(X86::ESP, MVT::i32), // Base 11617 DAG.getTargetConstant(1, MVT::i8), // Scale 11618 DAG.getRegister(0, MVT::i32), // Index 11619 DAG.getTargetConstant(0, MVT::i32), // Disp 11620 DAG.getRegister(0, MVT::i32), // Segment. 11621 Zero, 11622 Chain 11623 }; 11624 SDNode *Res = 11625 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 11626 array_lengthof(Ops)); 11627 return SDValue(Res, 0); 11628 } 11629 11630 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
11631 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 11632} 11633 11634static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, 11635 SelectionDAG &DAG) { 11636 EVT T = Op.getValueType(); 11637 DebugLoc DL = Op.getDebugLoc(); 11638 unsigned Reg = 0; 11639 unsigned size = 0; 11640 switch(T.getSimpleVT().SimpleTy) { 11641 default: llvm_unreachable("Invalid value type!"); 11642 case MVT::i8: Reg = X86::AL; size = 1; break; 11643 case MVT::i16: Reg = X86::AX; size = 2; break; 11644 case MVT::i32: Reg = X86::EAX; size = 4; break; 11645 case MVT::i64: 11646 assert(Subtarget->is64Bit() && "Node not type legal!"); 11647 Reg = X86::RAX; size = 8; 11648 break; 11649 } 11650 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 11651 Op.getOperand(2), SDValue()); 11652 SDValue Ops[] = { cpIn.getValue(0), 11653 Op.getOperand(1), 11654 Op.getOperand(3), 11655 DAG.getTargetConstant(size, MVT::i8), 11656 cpIn.getValue(1) }; 11657 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 11658 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 11659 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 11660 Ops, 5, T, MMO); 11661 SDValue cpOut = 11662 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 11663 return cpOut; 11664} 11665 11666static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, 11667 SelectionDAG &DAG) { 11668 assert(Subtarget->is64Bit() && "Result not type legalized?"); 11669 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 11670 SDValue TheChain = Op.getOperand(0); 11671 DebugLoc dl = Op.getDebugLoc(); 11672 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 11673 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 11674 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 11675 rax.getValue(2)); 11676 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 11677 DAG.getConstant(32, MVT::i8)); 11678 SDValue Ops[] = { 11679 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 11680 rdx.getValue(1) 11681 }; 11682 return DAG.getMergeValues(Ops, 2, dl); 11683} 11684 11685SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { 11686 EVT SrcVT = Op.getOperand(0).getValueType(); 11687 EVT DstVT = Op.getValueType(); 11688 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 11689 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 11690 assert((DstVT == MVT::i64 || 11691 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 11692 "Unexpected custom BITCAST"); 11693 // i64 <=> MMX conversions are Legal. 11694 if (SrcVT==MVT::i64 && DstVT.isVector()) 11695 return Op; 11696 if (DstVT==MVT::i64 && SrcVT.isVector()) 11697 return Op; 11698 // MMX <=> MMX conversions are Legal. 11699 if (SrcVT.isVector() && DstVT.isVector()) 11700 return Op; 11701 // All other conversions need to be expanded. 
11702 return SDValue(); 11703} 11704 11705static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 11706 SDNode *Node = Op.getNode(); 11707 DebugLoc dl = Node->getDebugLoc(); 11708 EVT T = Node->getValueType(0); 11709 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 11710 DAG.getConstant(0, T), Node->getOperand(2)); 11711 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 11712 cast<AtomicSDNode>(Node)->getMemoryVT(), 11713 Node->getOperand(0), 11714 Node->getOperand(1), negOp, 11715 cast<AtomicSDNode>(Node)->getSrcValue(), 11716 cast<AtomicSDNode>(Node)->getAlignment(), 11717 cast<AtomicSDNode>(Node)->getOrdering(), 11718 cast<AtomicSDNode>(Node)->getSynchScope()); 11719} 11720 11721static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 11722 SDNode *Node = Op.getNode(); 11723 DebugLoc dl = Node->getDebugLoc(); 11724 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 11725 11726 // Convert seq_cst store -> xchg 11727 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 11728 // FIXME: On 32-bit, store -> fist or movq would be more efficient 11729 // (The only way to get a 16-byte store is cmpxchg16b) 11730 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 11731 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 11732 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 11733 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 11734 cast<AtomicSDNode>(Node)->getMemoryVT(), 11735 Node->getOperand(0), 11736 Node->getOperand(1), Node->getOperand(2), 11737 cast<AtomicSDNode>(Node)->getMemOperand(), 11738 cast<AtomicSDNode>(Node)->getOrdering(), 11739 cast<AtomicSDNode>(Node)->getSynchScope()); 11740 return Swap.getValue(1); 11741 } 11742 // Other atomic stores have a simple pattern. 11743 return Op; 11744} 11745 11746static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 11747 EVT VT = Op.getNode()->getValueType(0); 11748 11749 // Let legalize expand this if it isn't a legal type yet. 11750 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 11751 return SDValue(); 11752 11753 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 11754 11755 unsigned Opc; 11756 bool ExtraOp = false; 11757 switch (Op.getOpcode()) { 11758 default: llvm_unreachable("Invalid code"); 11759 case ISD::ADDC: Opc = X86ISD::ADD; break; 11760 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 11761 case ISD::SUBC: Opc = X86ISD::SUB; break; 11762 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 11763 } 11764 11765 if (!ExtraOp) 11766 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 11767 Op.getOperand(1)); 11768 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 11769 Op.getOperand(1), Op.getOperand(2)); 11770} 11771 11772/// LowerOperation - Provide custom lowering hooks for some operations. 
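/// Each case below simply dispatches to the matching Lower* helper; hitting
/// the default means an operation was marked Custom in the constructor
/// without a lowering being added here.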
11773/// 11774SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 11775 switch (Op.getOpcode()) { 11776 default: llvm_unreachable("Should not custom lower this!"); 11777 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 11778 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, Subtarget, DAG); 11779 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); 11780 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); 11781 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 11782 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); 11783 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 11784 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 11785 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 11786 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 11787 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 11788 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); 11789 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); 11790 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 11791 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 11792 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 11793 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 11794 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 11795 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 11796 case ISD::SHL_PARTS: 11797 case ISD::SRA_PARTS: 11798 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 11799 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 11800 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 11801 case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG); 11802 case ISD::ZERO_EXTEND: return lowerZERO_EXTEND(Op, DAG); 11803 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 11804 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 11805 case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG); 11806 case ISD::FABS: return LowerFABS(Op, DAG); 11807 case ISD::FNEG: return LowerFNEG(Op, DAG); 11808 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 11809 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 11810 case ISD::SETCC: return LowerSETCC(Op, DAG); 11811 case ISD::SELECT: return LowerSELECT(Op, DAG); 11812 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 11813 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 11814 case ISD::VASTART: return LowerVASTART(Op, DAG); 11815 case ISD::VAARG: return LowerVAARG(Op, DAG); 11816 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); 11817 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 11818 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); 11819 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 11820 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 11821 case ISD::FRAME_TO_ARGS_OFFSET: 11822 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 11823 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 11824 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 11825 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); 11826 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); 11827 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 11828 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 11829 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 11830 case ISD::CTLZ: 
return LowerCTLZ(Op, DAG); 11831 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); 11832 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 11833 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); 11834 case ISD::SRA: 11835 case ISD::SRL: 11836 case ISD::SHL: return LowerShift(Op, DAG); 11837 case ISD::SADDO: 11838 case ISD::UADDO: 11839 case ISD::SSUBO: 11840 case ISD::USUBO: 11841 case ISD::SMULO: 11842 case ISD::UMULO: return LowerXALUO(Op, DAG); 11843 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); 11844 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 11845 case ISD::ADDC: 11846 case ISD::ADDE: 11847 case ISD::SUBC: 11848 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 11849 case ISD::ADD: return LowerADD(Op, DAG); 11850 case ISD::SUB: return LowerSUB(Op, DAG); 11851 } 11852} 11853 11854static void ReplaceATOMIC_LOAD(SDNode *Node, 11855 SmallVectorImpl<SDValue> &Results, 11856 SelectionDAG &DAG) { 11857 DebugLoc dl = Node->getDebugLoc(); 11858 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 11859 11860 // Convert wide load -> cmpxchg8b/cmpxchg16b 11861 // FIXME: On 32-bit, load -> fild or movq would be more efficient 11862 // (The only way to get a 16-byte load is cmpxchg16b) 11863 // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. 11864 SDValue Zero = DAG.getConstant(0, VT); 11865 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, 11866 Node->getOperand(0), 11867 Node->getOperand(1), Zero, Zero, 11868 cast<AtomicSDNode>(Node)->getMemOperand(), 11869 cast<AtomicSDNode>(Node)->getOrdering(), 11870 cast<AtomicSDNode>(Node)->getSynchScope()); 11871 Results.push_back(Swap.getValue(0)); 11872 Results.push_back(Swap.getValue(1)); 11873} 11874 11875static void 11876ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 11877 SelectionDAG &DAG, unsigned NewOp) { 11878 DebugLoc dl = Node->getDebugLoc(); 11879 assert (Node->getValueType(0) == MVT::i64 && 11880 "Only know how to expand i64 atomics"); 11881 11882 SDValue Chain = Node->getOperand(0); 11883 SDValue In1 = Node->getOperand(1); 11884 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 11885 Node->getOperand(2), DAG.getIntPtrConstant(0)); 11886 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 11887 Node->getOperand(2), DAG.getIntPtrConstant(1)); 11888 SDValue Ops[] = { Chain, In1, In2L, In2H }; 11889 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 11890 SDValue Result = 11891 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 11892 cast<MemSDNode>(Node)->getMemOperand()); 11893 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 11894 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 11895 Results.push_back(Result.getValue(2)); 11896} 11897 11898/// ReplaceNodeResults - Replace a node with an illegal result type 11899/// with a new node built out of custom code. 11900void X86TargetLowering::ReplaceNodeResults(SDNode *N, 11901 SmallVectorImpl<SDValue>&Results, 11902 SelectionDAG &DAG) const { 11903 DebugLoc dl = N->getDebugLoc(); 11904 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11905 switch (N->getOpcode()) { 11906 default: 11907 llvm_unreachable("Do not know how to custom type legalize this operation!"); 11908 case ISD::SIGN_EXTEND_INREG: 11909 case ISD::ADDC: 11910 case ISD::ADDE: 11911 case ISD::SUBC: 11912 case ISD::SUBE: 11913 // We don't want to expand or promote these. 
11914 return; 11915 case ISD::FP_TO_SINT: 11916 case ISD::FP_TO_UINT: { 11917 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 11918 11919 if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) 11920 return; 11921 11922 std::pair<SDValue,SDValue> Vals = 11923 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); 11924 SDValue FIST = Vals.first, StackSlot = Vals.second; 11925 if (FIST.getNode() != 0) { 11926 EVT VT = N->getValueType(0); 11927 // Return a load from the stack slot. 11928 if (StackSlot.getNode() != 0) 11929 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 11930 MachinePointerInfo(), 11931 false, false, false, 0)); 11932 else 11933 Results.push_back(FIST); 11934 } 11935 return; 11936 } 11937 case ISD::UINT_TO_FP: { 11938 if (N->getOperand(0).getValueType() != MVT::v2i32 && 11939 N->getValueType(0) != MVT::v2f32) 11940 return; 11941 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, 11942 N->getOperand(0)); 11943 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 11944 MVT::f64); 11945 SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); 11946 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, 11947 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias)); 11948 Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or); 11949 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); 11950 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); 11951 return; 11952 } 11953 case ISD::FP_ROUND: { 11954 if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) 11955 return; 11956 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); 11957 Results.push_back(V); 11958 return; 11959 } 11960 case ISD::READCYCLECOUNTER: { 11961 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 11962 SDValue TheChain = N->getOperand(0); 11963 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 11964 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 11965 rd.getValue(1)); 11966 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 11967 eax.getValue(2)); 11968 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 11969 SDValue Ops[] = { eax, edx }; 11970 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 11971 Results.push_back(edx.getValue(1)); 11972 return; 11973 } 11974 case ISD::ATOMIC_CMP_SWAP: { 11975 EVT T = N->getValueType(0); 11976 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); 11977 bool Regs64bit = T == MVT::i128; 11978 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; 11979 SDValue cpInL, cpInH; 11980 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 11981 DAG.getConstant(0, HalfT)); 11982 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 11983 DAG.getConstant(1, HalfT)); 11984 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, 11985 Regs64bit ? X86::RAX : X86::EAX, 11986 cpInL, SDValue()); 11987 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, 11988 Regs64bit ? X86::RDX : X86::EDX, 11989 cpInH, cpInL.getValue(1)); 11990 SDValue swapInL, swapInH; 11991 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 11992 DAG.getConstant(0, HalfT)); 11993 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 11994 DAG.getConstant(1, HalfT)); 11995 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, 11996 Regs64bit ? 
X86::RBX : X86::EBX, 11997 swapInL, cpInH.getValue(1)); 11998 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, 11999 Regs64bit ? X86::RCX : X86::ECX, 12000 swapInH, swapInL.getValue(1)); 12001 SDValue Ops[] = { swapInH.getValue(0), 12002 N->getOperand(1), 12003 swapInH.getValue(1) }; 12004 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 12005 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 12006 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : 12007 X86ISD::LCMPXCHG8_DAG; 12008 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, 12009 Ops, 3, T, MMO); 12010 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, 12011 Regs64bit ? X86::RAX : X86::EAX, 12012 HalfT, Result.getValue(1)); 12013 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, 12014 Regs64bit ? X86::RDX : X86::EDX, 12015 HalfT, cpOutL.getValue(2)); 12016 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 12017 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); 12018 Results.push_back(cpOutH.getValue(1)); 12019 return; 12020 } 12021 case ISD::ATOMIC_LOAD_ADD: 12022 case ISD::ATOMIC_LOAD_AND: 12023 case ISD::ATOMIC_LOAD_NAND: 12024 case ISD::ATOMIC_LOAD_OR: 12025 case ISD::ATOMIC_LOAD_SUB: 12026 case ISD::ATOMIC_LOAD_XOR: 12027 case ISD::ATOMIC_LOAD_MAX: 12028 case ISD::ATOMIC_LOAD_MIN: 12029 case ISD::ATOMIC_LOAD_UMAX: 12030 case ISD::ATOMIC_LOAD_UMIN: 12031 case ISD::ATOMIC_SWAP: { 12032 unsigned Opc; 12033 switch (N->getOpcode()) { 12034 default: llvm_unreachable("Unexpected opcode"); 12035 case ISD::ATOMIC_LOAD_ADD: 12036 Opc = X86ISD::ATOMADD64_DAG; 12037 break; 12038 case ISD::ATOMIC_LOAD_AND: 12039 Opc = X86ISD::ATOMAND64_DAG; 12040 break; 12041 case ISD::ATOMIC_LOAD_NAND: 12042 Opc = X86ISD::ATOMNAND64_DAG; 12043 break; 12044 case ISD::ATOMIC_LOAD_OR: 12045 Opc = X86ISD::ATOMOR64_DAG; 12046 break; 12047 case ISD::ATOMIC_LOAD_SUB: 12048 Opc = X86ISD::ATOMSUB64_DAG; 12049 break; 12050 case ISD::ATOMIC_LOAD_XOR: 12051 Opc = X86ISD::ATOMXOR64_DAG; 12052 break; 12053 case ISD::ATOMIC_LOAD_MAX: 12054 Opc = X86ISD::ATOMMAX64_DAG; 12055 break; 12056 case ISD::ATOMIC_LOAD_MIN: 12057 Opc = X86ISD::ATOMMIN64_DAG; 12058 break; 12059 case ISD::ATOMIC_LOAD_UMAX: 12060 Opc = X86ISD::ATOMUMAX64_DAG; 12061 break; 12062 case ISD::ATOMIC_LOAD_UMIN: 12063 Opc = X86ISD::ATOMUMIN64_DAG; 12064 break; 12065 case ISD::ATOMIC_SWAP: 12066 Opc = X86ISD::ATOMSWAP64_DAG; 12067 break; 12068 } 12069 ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); 12070 return; 12071 } 12072 case ISD::ATOMIC_LOAD: 12073 ReplaceATOMIC_LOAD(N, Results, DAG); 12074 } 12075} 12076 12077const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 12078 switch (Opcode) { 12079 default: return NULL; 12080 case X86ISD::BSF: return "X86ISD::BSF"; 12081 case X86ISD::BSR: return "X86ISD::BSR"; 12082 case X86ISD::SHLD: return "X86ISD::SHLD"; 12083 case X86ISD::SHRD: return "X86ISD::SHRD"; 12084 case X86ISD::FAND: return "X86ISD::FAND"; 12085 case X86ISD::FOR: return "X86ISD::FOR"; 12086 case X86ISD::FXOR: return "X86ISD::FXOR"; 12087 case X86ISD::FSRL: return "X86ISD::FSRL"; 12088 case X86ISD::FILD: return "X86ISD::FILD"; 12089 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 12090 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 12091 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 12092 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 12093 case X86ISD::FLD: return "X86ISD::FLD"; 12094 case X86ISD::FST: return "X86ISD::FST"; 12095 
case X86ISD::CALL: return "X86ISD::CALL"; 12096 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 12097 case X86ISD::BT: return "X86ISD::BT"; 12098 case X86ISD::CMP: return "X86ISD::CMP"; 12099 case X86ISD::COMI: return "X86ISD::COMI"; 12100 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 12101 case X86ISD::SETCC: return "X86ISD::SETCC"; 12102 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 12103 case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; 12104 case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; 12105 case X86ISD::CMOV: return "X86ISD::CMOV"; 12106 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 12107 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 12108 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 12109 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 12110 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 12111 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 12112 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 12113 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 12114 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 12115 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 12116 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 12117 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 12118 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 12119 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 12120 case X86ISD::PSIGN: return "X86ISD::PSIGN"; 12121 case X86ISD::BLENDV: return "X86ISD::BLENDV"; 12122 case X86ISD::BLENDI: return "X86ISD::BLENDI"; 12123 case X86ISD::SUBUS: return "X86ISD::SUBUS"; 12124 case X86ISD::HADD: return "X86ISD::HADD"; 12125 case X86ISD::HSUB: return "X86ISD::HSUB"; 12126 case X86ISD::FHADD: return "X86ISD::FHADD"; 12127 case X86ISD::FHSUB: return "X86ISD::FHSUB"; 12128 case X86ISD::UMAX: return "X86ISD::UMAX"; 12129 case X86ISD::UMIN: return "X86ISD::UMIN"; 12130 case X86ISD::SMAX: return "X86ISD::SMAX"; 12131 case X86ISD::SMIN: return "X86ISD::SMIN"; 12132 case X86ISD::FMAX: return "X86ISD::FMAX"; 12133 case X86ISD::FMIN: return "X86ISD::FMIN"; 12134 case X86ISD::FMAXC: return "X86ISD::FMAXC"; 12135 case X86ISD::FMINC: return "X86ISD::FMINC"; 12136 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 12137 case X86ISD::FRCP: return "X86ISD::FRCP"; 12138 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 12139 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; 12140 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 12141 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; 12142 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; 12143 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 12144 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 12145 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 12146 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; 12147 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 12148 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 12149 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 12150 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 12151 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 12152 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 12153 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 12154 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 12155 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 12156 case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL"; 12157 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 12158 case X86ISD::VZEXT: return "X86ISD::VZEXT"; 12159 case X86ISD::VSEXT: return 
"X86ISD::VSEXT"; 12160 case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; 12161 case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; 12162 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; 12163 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; 12164 case X86ISD::VSHL: return "X86ISD::VSHL"; 12165 case X86ISD::VSRL: return "X86ISD::VSRL"; 12166 case X86ISD::VSRA: return "X86ISD::VSRA"; 12167 case X86ISD::VSHLI: return "X86ISD::VSHLI"; 12168 case X86ISD::VSRLI: return "X86ISD::VSRLI"; 12169 case X86ISD::VSRAI: return "X86ISD::VSRAI"; 12170 case X86ISD::CMPP: return "X86ISD::CMPP"; 12171 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; 12172 case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; 12173 case X86ISD::ADD: return "X86ISD::ADD"; 12174 case X86ISD::SUB: return "X86ISD::SUB"; 12175 case X86ISD::ADC: return "X86ISD::ADC"; 12176 case X86ISD::SBB: return "X86ISD::SBB"; 12177 case X86ISD::SMUL: return "X86ISD::SMUL"; 12178 case X86ISD::UMUL: return "X86ISD::UMUL"; 12179 case X86ISD::INC: return "X86ISD::INC"; 12180 case X86ISD::DEC: return "X86ISD::DEC"; 12181 case X86ISD::OR: return "X86ISD::OR"; 12182 case X86ISD::XOR: return "X86ISD::XOR"; 12183 case X86ISD::AND: return "X86ISD::AND"; 12184 case X86ISD::BLSI: return "X86ISD::BLSI"; 12185 case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; 12186 case X86ISD::BLSR: return "X86ISD::BLSR"; 12187 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 12188 case X86ISD::PTEST: return "X86ISD::PTEST"; 12189 case X86ISD::TESTP: return "X86ISD::TESTP"; 12190 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 12191 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 12192 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 12193 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 12194 case X86ISD::SHUFP: return "X86ISD::SHUFP"; 12195 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 12196 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 12197 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 12198 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 12199 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 12200 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 12201 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 12202 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 12203 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 12204 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 12205 case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; 12206 case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; 12207 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 12208 case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; 12209 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; 12210 case X86ISD::VPERMV: return "X86ISD::VPERMV"; 12211 case X86ISD::VPERMI: return "X86ISD::VPERMI"; 12212 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; 12213 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 12214 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 12215 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 12216 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 12217 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; 12218 case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; 12219 case X86ISD::SAHF: return "X86ISD::SAHF"; 12220 case X86ISD::RDRAND: return "X86ISD::RDRAND"; 12221 case X86ISD::FMADD: return "X86ISD::FMADD"; 12222 case X86ISD::FMSUB: return "X86ISD::FMSUB"; 12223 case X86ISD::FNMADD: return "X86ISD::FNMADD"; 12224 case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; 12225 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; 12226 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; 12227 case X86ISD::PCMPESTRI: return 
"X86ISD::PCMPESTRI"; 12228 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; 12229 } 12230} 12231 12232// isLegalAddressingMode - Return true if the addressing mode represented 12233// by AM is legal for this target, for a load/store of the specified type. 12234bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 12235 Type *Ty) const { 12236 // X86 supports extremely general addressing modes. 12237 CodeModel::Model M = getTargetMachine().getCodeModel(); 12238 Reloc::Model R = getTargetMachine().getRelocationModel(); 12239 12240 // X86 allows a sign-extended 32-bit immediate field as a displacement. 12241 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 12242 return false; 12243 12244 if (AM.BaseGV) { 12245 unsigned GVFlags = 12246 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 12247 12248 // If a reference to this global requires an extra load, we can't fold it. 12249 if (isGlobalStubReference(GVFlags)) 12250 return false; 12251 12252 // If BaseGV requires a register for the PIC base, we cannot also have a 12253 // BaseReg specified. 12254 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 12255 return false; 12256 12257 // If lower 4G is not available, then we must use rip-relative addressing. 12258 if ((M != CodeModel::Small || R != Reloc::Static) && 12259 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 12260 return false; 12261 } 12262 12263 switch (AM.Scale) { 12264 case 0: 12265 case 1: 12266 case 2: 12267 case 4: 12268 case 8: 12269 // These scales always work. 12270 break; 12271 case 3: 12272 case 5: 12273 case 9: 12274 // These scales are formed with basereg+scalereg. Only accept if there is 12275 // no basereg yet. 12276 if (AM.HasBaseReg) 12277 return false; 12278 break; 12279 default: // Other stuff never works. 12280 return false; 12281 } 12282 12283 return true; 12284} 12285 12286bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 12287 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 12288 return false; 12289 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 12290 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 12291 if (NumBits1 <= NumBits2) 12292 return false; 12293 return true; 12294} 12295 12296bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { 12297 return Imm == (int32_t)Imm; 12298} 12299 12300bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { 12301 // Can also use sub to handle negated immediates. 12302 return Imm == (int32_t)Imm; 12303} 12304 12305bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 12306 if (!VT1.isInteger() || !VT2.isInteger()) 12307 return false; 12308 unsigned NumBits1 = VT1.getSizeInBits(); 12309 unsigned NumBits2 = VT2.getSizeInBits(); 12310 if (NumBits1 <= NumBits2) 12311 return false; 12312 return true; 12313} 12314 12315bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 12316 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 12317 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 12318} 12319 12320bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 12321 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 
12322 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 12323} 12324 12325bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 12326 EVT VT1 = Val.getValueType(); 12327 if (isZExtFree(VT1, VT2)) 12328 return true; 12329 12330 if (Val.getOpcode() != ISD::LOAD) 12331 return false; 12332 12333 if (!VT1.isSimple() || !VT1.isInteger() || 12334 !VT2.isSimple() || !VT2.isInteger()) 12335 return false; 12336 12337 switch (VT1.getSimpleVT().SimpleTy) { 12338 default: break; 12339 case MVT::i8: 12340 case MVT::i16: 12341 case MVT::i32: 12342 // X86 has 8, 16, and 32-bit zero-extending loads. 12343 return true; 12344 } 12345 12346 return false; 12347} 12348 12349bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 12350 // i16 instructions are longer (0x66 prefix) and potentially slower. 12351 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 12352} 12353 12354/// isShuffleMaskLegal - Targets can use this to indicate that they only 12355/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 12356/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 12357/// are assumed to be legal. 12358bool 12359X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 12360 EVT VT) const { 12361 // Very little shuffling can be done for 64-bit vectors right now. 12362 if (VT.getSizeInBits() == 64) 12363 return false; 12364 12365 // FIXME: pshufb, blends, shifts. 12366 return (VT.getVectorNumElements() == 2 || 12367 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 12368 isMOVLMask(M, VT) || 12369 isSHUFPMask(M, VT, Subtarget->hasFp256()) || 12370 isPSHUFDMask(M, VT) || 12371 isPSHUFHWMask(M, VT, Subtarget->hasInt256()) || 12372 isPSHUFLWMask(M, VT, Subtarget->hasInt256()) || 12373 isPALIGNRMask(M, VT, Subtarget) || 12374 isUNPCKLMask(M, VT, Subtarget->hasInt256()) || 12375 isUNPCKHMask(M, VT, Subtarget->hasInt256()) || 12376 isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) || 12377 isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256())); 12378} 12379 12380bool 12381X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 12382 EVT VT) const { 12383 unsigned NumElts = VT.getVectorNumElements(); 12384 // FIXME: This collection of masks seems suspect. 12385 if (NumElts == 2) 12386 return true; 12387 if (NumElts == 4 && VT.is128BitVector()) { 12388 return (isMOVLMask(Mask, VT) || 12389 isCommutedMOVLMask(Mask, VT, true) || 12390 isSHUFPMask(Mask, VT, Subtarget->hasFp256()) || 12391 isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true)); 12392 } 12393 return false; 12394} 12395 12396//===----------------------------------------------------------------------===// 12397// X86 Scheduler Hooks 12398//===----------------------------------------------------------------------===// 12399 12400/// Utility function to emit xbegin specifying the start of an RTM region. 
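/// For illustration: the RTM begin intrinsic (e.g. %status = call i32
/// @llvm.x86.xbegin(), the builtin behind _xbegin()) is the kind of input
/// expected here. %status ends up as -1 when the transaction starts
/// successfully, and as the abort status delivered in EAX when the
/// transaction aborts back to the sink block.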
static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
                                     const TargetInstrInfo *TII) {
  DebugLoc DL = MI->getDebugLoc();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = MBB;
  ++I;

  // For the v = xbegin(), we generate
  //
  // thisMBB:
  //  xbegin sinkMBB
  //
  // mainMBB:
  //  eax = -1
  //
  // sinkMBB:
  //  v = eax

  MachineBasicBlock *thisMBB = MBB;
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  //  xbegin sinkMBB
  //  # fallthrough to mainMBB
  //  # abort to sinkMBB
  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(sinkMBB);

  // mainMBB:
  //  EAX = -1
  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  // EAX is live into the sinkMBB
  sinkMBB->addLiveIn(X86::EAX);
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
    .addReg(X86::EAX);

  MI->eraseFromParent();
  return sinkMBB;
}

// Get CMPXCHG opcode for the specified data type.
static unsigned getCmpXChgOpcode(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i8:  return X86::LCMPXCHG8;
  case MVT::i16: return X86::LCMPXCHG16;
  case MVT::i32: return X86::LCMPXCHG32;
  case MVT::i64: return X86::LCMPXCHG64;
  default:
    break;
  }
  llvm_unreachable("Invalid operand size!");
}

// Get LOAD opcode for the specified data type.
static unsigned getLoadOpcode(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i8:  return X86::MOV8rm;
  case MVT::i16: return X86::MOV16rm;
  case MVT::i32: return X86::MOV32rm;
  case MVT::i64: return X86::MOV64rm;
  default:
    break;
  }
  llvm_unreachable("Invalid operand size!");
}

// Get opcode of the non-atomic one from the specified atomic instruction.
static unsigned getNonAtomicOpcode(unsigned Opc) {
  switch (Opc) {
  case X86::ATOMAND8:  return X86::AND8rr;
  case X86::ATOMAND16: return X86::AND16rr;
  case X86::ATOMAND32: return X86::AND32rr;
  case X86::ATOMAND64: return X86::AND64rr;
  case X86::ATOMOR8:  return X86::OR8rr;
  case X86::ATOMOR16: return X86::OR16rr;
  case X86::ATOMOR32: return X86::OR32rr;
  case X86::ATOMOR64: return X86::OR64rr;
  case X86::ATOMXOR8:  return X86::XOR8rr;
  case X86::ATOMXOR16: return X86::XOR16rr;
  case X86::ATOMXOR32: return X86::XOR32rr;
  case X86::ATOMXOR64: return X86::XOR64rr;
  }
  llvm_unreachable("Unhandled atomic-load-op opcode!");
}

// Get opcode of the non-atomic one from the specified atomic instruction with
// extra opcode.
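// For the min/max flavours the extra opcode is the CMPxxrr that sets EFLAGS
// and the returned opcode is the CMOVcc that picks the result; for NAND the
// returned opcode is the AND and the extra opcode is the NOT that finishes it.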
12503static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, 12504 unsigned &ExtraOpc) { 12505 switch (Opc) { 12506 case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr; 12507 case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr; 12508 case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr; 12509 case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr; 12510 case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr; 12511 case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr; 12512 case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr; 12513 case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr; 12514 case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr; 12515 case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr; 12516 case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr; 12517 case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr; 12518 case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr; 12519 case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr; 12520 case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr; 12521 case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr; 12522 case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr; 12523 case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr; 12524 case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr; 12525 case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr; 12526 } 12527 llvm_unreachable("Unhandled atomic-load-op opcode!"); 12528} 12529 12530// Get opcode of the non-atomic one from the specified atomic instruction for 12531// 64-bit data type on 32-bit target. 12532static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) { 12533 switch (Opc) { 12534 case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr; 12535 case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr; 12536 case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr; 12537 case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr; 12538 case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr; 12539 case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr; 12540 case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr; 12541 case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr; 12542 case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr; 12543 case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr; 12544 } 12545 llvm_unreachable("Unhandled atomic-load-op opcode!"); 12546} 12547 12548// Get opcode of the non-atomic one from the specified atomic instruction for 12549// 64-bit data type on 32-bit target with extra opcode. 12550static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, 12551 unsigned &HiOpc, 12552 unsigned &ExtraOpc) { 12553 switch (Opc) { 12554 case X86::ATOMNAND6432: 12555 ExtraOpc = X86::NOT32r; 12556 HiOpc = X86::AND32rr; 12557 return X86::AND32rr; 12558 } 12559 llvm_unreachable("Unhandled atomic-load-op opcode!"); 12560} 12561 12562// Get pseudo CMOV opcode from the specified data type. 
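// These CMOV_GRxx pseudos are used below when the subtarget lacks native
// CMOV; EmitLoweredSelect later expands them into an explicit branch diamond.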
12563static unsigned getPseudoCMOVOpc(EVT VT) { 12564 switch (VT.getSimpleVT().SimpleTy) { 12565 case MVT::i8: return X86::CMOV_GR8; 12566 case MVT::i16: return X86::CMOV_GR16; 12567 case MVT::i32: return X86::CMOV_GR32; 12568 default: 12569 break; 12570 } 12571 llvm_unreachable("Unknown CMOV opcode!"); 12572} 12573 12574// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions. 12575// They will be translated into a spin-loop or compare-exchange loop from 12576// 12577// ... 12578// dst = atomic-fetch-op MI.addr, MI.val 12579// ... 12580// 12581// to 12582// 12583// ... 12584// EAX = LOAD MI.addr 12585// loop: 12586// t1 = OP MI.val, EAX 12587// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] 12588// JNE loop 12589// sink: 12590// dst = EAX 12591// ... 12592MachineBasicBlock * 12593X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, 12594 MachineBasicBlock *MBB) const { 12595 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12596 DebugLoc DL = MI->getDebugLoc(); 12597 12598 MachineFunction *MF = MBB->getParent(); 12599 MachineRegisterInfo &MRI = MF->getRegInfo(); 12600 12601 const BasicBlock *BB = MBB->getBasicBlock(); 12602 MachineFunction::iterator I = MBB; 12603 ++I; 12604 12605 assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 && 12606 "Unexpected number of operands"); 12607 12608 assert(MI->hasOneMemOperand() && 12609 "Expected atomic-load-op to have one memoperand"); 12610 12611 // Memory Reference 12612 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 12613 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 12614 12615 unsigned DstReg, SrcReg; 12616 unsigned MemOpndSlot; 12617 12618 unsigned CurOp = 0; 12619 12620 DstReg = MI->getOperand(CurOp++).getReg(); 12621 MemOpndSlot = CurOp; 12622 CurOp += X86::AddrNumOperands; 12623 SrcReg = MI->getOperand(CurOp++).getReg(); 12624 12625 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 12626 MVT::SimpleValueType VT = *RC->vt_begin(); 12627 unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT); 12628 12629 unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT); 12630 unsigned LOADOpc = getLoadOpcode(VT); 12631 12632 // For the atomic load-arith operator, we generate 12633 // 12634 // thisMBB: 12635 // EAX = LOAD [MI.addr] 12636 // mainMBB: 12637 // t1 = OP MI.val, EAX 12638 // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] 12639 // JNE mainMBB 12640 // sinkMBB: 12641 12642 MachineBasicBlock *thisMBB = MBB; 12643 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 12644 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 12645 MF->insert(I, mainMBB); 12646 MF->insert(I, sinkMBB); 12647 12648 MachineInstrBuilder MIB; 12649 12650 // Transfer the remainder of BB and its successor edges to sinkMBB. 12651 sinkMBB->splice(sinkMBB->begin(), MBB, 12652 llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); 12653 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 12654 12655 // thisMBB: 12656 MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg); 12657 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 12658 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 12659 MIB.setMemRefs(MMOBegin, MMOEnd); 12660 12661 thisMBB->addSuccessor(mainMBB); 12662 12663 // mainMBB: 12664 MachineBasicBlock *origMainMBB = mainMBB; 12665 mainMBB->addLiveIn(AccPhyReg); 12666 12667 // Copy AccPhyReg as it is used more than once. 
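  // (The physical accumulator is an input of the arithmetic op below and is
  // also implicitly used and redefined by LCMPXCHG, so keep a copy in a
  // virtual register.)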
12668 unsigned AccReg = MRI.createVirtualRegister(RC); 12669 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg) 12670 .addReg(AccPhyReg); 12671 12672 unsigned t1 = MRI.createVirtualRegister(RC); 12673 unsigned Opc = MI->getOpcode(); 12674 switch (Opc) { 12675 default: 12676 llvm_unreachable("Unhandled atomic-load-op opcode!"); 12677 case X86::ATOMAND8: 12678 case X86::ATOMAND16: 12679 case X86::ATOMAND32: 12680 case X86::ATOMAND64: 12681 case X86::ATOMOR8: 12682 case X86::ATOMOR16: 12683 case X86::ATOMOR32: 12684 case X86::ATOMOR64: 12685 case X86::ATOMXOR8: 12686 case X86::ATOMXOR16: 12687 case X86::ATOMXOR32: 12688 case X86::ATOMXOR64: { 12689 unsigned ARITHOpc = getNonAtomicOpcode(Opc); 12690 BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg) 12691 .addReg(AccReg); 12692 break; 12693 } 12694 case X86::ATOMNAND8: 12695 case X86::ATOMNAND16: 12696 case X86::ATOMNAND32: 12697 case X86::ATOMNAND64: { 12698 unsigned t2 = MRI.createVirtualRegister(RC); 12699 unsigned NOTOpc; 12700 unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); 12701 BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg) 12702 .addReg(AccReg); 12703 BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2); 12704 break; 12705 } 12706 case X86::ATOMMAX8: 12707 case X86::ATOMMAX16: 12708 case X86::ATOMMAX32: 12709 case X86::ATOMMAX64: 12710 case X86::ATOMMIN8: 12711 case X86::ATOMMIN16: 12712 case X86::ATOMMIN32: 12713 case X86::ATOMMIN64: 12714 case X86::ATOMUMAX8: 12715 case X86::ATOMUMAX16: 12716 case X86::ATOMUMAX32: 12717 case X86::ATOMUMAX64: 12718 case X86::ATOMUMIN8: 12719 case X86::ATOMUMIN16: 12720 case X86::ATOMUMIN32: 12721 case X86::ATOMUMIN64: { 12722 unsigned CMPOpc; 12723 unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc); 12724 12725 BuildMI(mainMBB, DL, TII->get(CMPOpc)) 12726 .addReg(SrcReg) 12727 .addReg(AccReg); 12728 12729 if (Subtarget->hasCMov()) { 12730 if (VT != MVT::i8) { 12731 // Native support 12732 BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1) 12733 .addReg(SrcReg) 12734 .addReg(AccReg); 12735 } else { 12736 // Promote i8 to i32 to use CMOV32 12737 const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32); 12738 unsigned SrcReg32 = MRI.createVirtualRegister(RC32); 12739 unsigned AccReg32 = MRI.createVirtualRegister(RC32); 12740 unsigned t2 = MRI.createVirtualRegister(RC32); 12741 12742 unsigned Undef = MRI.createVirtualRegister(RC32); 12743 BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); 12744 12745 BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32) 12746 .addReg(Undef) 12747 .addReg(SrcReg) 12748 .addImm(X86::sub_8bit); 12749 BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) 12750 .addReg(Undef) 12751 .addReg(AccReg) 12752 .addImm(X86::sub_8bit); 12753 12754 BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) 12755 .addReg(SrcReg32) 12756 .addReg(AccReg32); 12757 12758 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1) 12759 .addReg(t2, 0, X86::sub_8bit); 12760 } 12761 } else { 12762 // Use pseudo select and lower them. 
12763 assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) && 12764 "Invalid atomic-load-op transformation!"); 12765 unsigned SelOpc = getPseudoCMOVOpc(VT); 12766 X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); 12767 assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); 12768 MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1) 12769 .addReg(SrcReg).addReg(AccReg) 12770 .addImm(CC); 12771 mainMBB = EmitLoweredSelect(MIB, mainMBB); 12772 } 12773 break; 12774 } 12775 } 12776 12777 // Copy AccPhyReg back from virtual register. 12778 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg) 12779 .addReg(AccReg); 12780 12781 MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); 12782 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 12783 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 12784 MIB.addReg(t1); 12785 MIB.setMemRefs(MMOBegin, MMOEnd); 12786 12787 BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); 12788 12789 mainMBB->addSuccessor(origMainMBB); 12790 mainMBB->addSuccessor(sinkMBB); 12791 12792 // sinkMBB: 12793 sinkMBB->addLiveIn(AccPhyReg); 12794 12795 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 12796 TII->get(TargetOpcode::COPY), DstReg) 12797 .addReg(AccPhyReg); 12798 12799 MI->eraseFromParent(); 12800 return sinkMBB; 12801} 12802 12803// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic 12804// instructions. They will be translated into a spin-loop or compare-exchange 12805// loop from 12806// 12807// ... 12808// dst = atomic-fetch-op MI.addr, MI.val 12809// ... 12810// 12811// to 12812// 12813// ... 12814// EAX = LOAD [MI.addr + 0] 12815// EDX = LOAD [MI.addr + 4] 12816// loop: 12817// EBX = OP MI.val.lo, EAX 12818// ECX = OP MI.val.hi, EDX 12819// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] 12820// JNE loop 12821// sink: 12822// dst = EDX:EAX 12823// ... 
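// As an illustration, on a 32-bit target an operation such as
//   %old = atomicrmw add i64* %ptr, i64 %val seq_cst
// has no single-width LOCK'ed form, so it reaches this inserter (via the
// ATOMADD6432 pseudo) and becomes the LCMPXCHG8B retry loop sketched above.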
12824MachineBasicBlock * 12825X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, 12826 MachineBasicBlock *MBB) const { 12827 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12828 DebugLoc DL = MI->getDebugLoc(); 12829 12830 MachineFunction *MF = MBB->getParent(); 12831 MachineRegisterInfo &MRI = MF->getRegInfo(); 12832 12833 const BasicBlock *BB = MBB->getBasicBlock(); 12834 MachineFunction::iterator I = MBB; 12835 ++I; 12836 12837 assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && 12838 "Unexpected number of operands"); 12839 12840 assert(MI->hasOneMemOperand() && 12841 "Expected atomic-load-op32 to have one memoperand"); 12842 12843 // Memory Reference 12844 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 12845 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 12846 12847 unsigned DstLoReg, DstHiReg; 12848 unsigned SrcLoReg, SrcHiReg; 12849 unsigned MemOpndSlot; 12850 12851 unsigned CurOp = 0; 12852 12853 DstLoReg = MI->getOperand(CurOp++).getReg(); 12854 DstHiReg = MI->getOperand(CurOp++).getReg(); 12855 MemOpndSlot = CurOp; 12856 CurOp += X86::AddrNumOperands; 12857 SrcLoReg = MI->getOperand(CurOp++).getReg(); 12858 SrcHiReg = MI->getOperand(CurOp++).getReg(); 12859 12860 const TargetRegisterClass *RC = &X86::GR32RegClass; 12861 const TargetRegisterClass *RC8 = &X86::GR8RegClass; 12862 12863 unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; 12864 unsigned LOADOpc = X86::MOV32rm; 12865 12866 // For the atomic load-arith operator, we generate 12867 // 12868 // thisMBB: 12869 // EAX = LOAD [MI.addr + 0] 12870 // EDX = LOAD [MI.addr + 4] 12871 // mainMBB: 12872 // EBX = OP MI.vallo, EAX 12873 // ECX = OP MI.valhi, EDX 12874 // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] 12875 // JNE mainMBB 12876 // sinkMBB: 12877 12878 MachineBasicBlock *thisMBB = MBB; 12879 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 12880 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 12881 MF->insert(I, mainMBB); 12882 MF->insert(I, sinkMBB); 12883 12884 MachineInstrBuilder MIB; 12885 12886 // Transfer the remainder of BB and its successor edges to sinkMBB. 12887 sinkMBB->splice(sinkMBB->begin(), MBB, 12888 llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); 12889 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 12890 12891 // thisMBB: 12892 // Lo 12893 MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX); 12894 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 12895 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 12896 MIB.setMemRefs(MMOBegin, MMOEnd); 12897 // Hi 12898 MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX); 12899 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 12900 if (i == X86::AddrDisp) 12901 MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) 12902 else 12903 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 12904 } 12905 MIB.setMemRefs(MMOBegin, MMOEnd); 12906 12907 thisMBB->addSuccessor(mainMBB); 12908 12909 // mainMBB: 12910 MachineBasicBlock *origMainMBB = mainMBB; 12911 mainMBB->addLiveIn(X86::EAX); 12912 mainMBB->addLiveIn(X86::EDX); 12913 12914 // Copy EDX:EAX as they are used more than once. 
12915 unsigned LoReg = MRI.createVirtualRegister(RC); 12916 unsigned HiReg = MRI.createVirtualRegister(RC); 12917 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX); 12918 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX); 12919 12920 unsigned t1L = MRI.createVirtualRegister(RC); 12921 unsigned t1H = MRI.createVirtualRegister(RC); 12922 12923 unsigned Opc = MI->getOpcode(); 12924 switch (Opc) { 12925 default: 12926 llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); 12927 case X86::ATOMAND6432: 12928 case X86::ATOMOR6432: 12929 case X86::ATOMXOR6432: 12930 case X86::ATOMADD6432: 12931 case X86::ATOMSUB6432: { 12932 unsigned HiOpc; 12933 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 12934 BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg); 12935 BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg); 12936 break; 12937 } 12938 case X86::ATOMNAND6432: { 12939 unsigned HiOpc, NOTOpc; 12940 unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); 12941 unsigned t2L = MRI.createVirtualRegister(RC); 12942 unsigned t2H = MRI.createVirtualRegister(RC); 12943 BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg); 12944 BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg); 12945 BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L); 12946 BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H); 12947 break; 12948 } 12949 case X86::ATOMMAX6432: 12950 case X86::ATOMMIN6432: 12951 case X86::ATOMUMAX6432: 12952 case X86::ATOMUMIN6432: { 12953 unsigned HiOpc; 12954 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 12955 unsigned cL = MRI.createVirtualRegister(RC8); 12956 unsigned cH = MRI.createVirtualRegister(RC8); 12957 unsigned cL32 = MRI.createVirtualRegister(RC); 12958 unsigned cH32 = MRI.createVirtualRegister(RC); 12959 unsigned cc = MRI.createVirtualRegister(RC); 12960 // cl := cmp src_lo, lo 12961 BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) 12962 .addReg(SrcLoReg).addReg(LoReg); 12963 BuildMI(mainMBB, DL, TII->get(LoOpc), cL); 12964 BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); 12965 // ch := cmp src_hi, hi 12966 BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) 12967 .addReg(SrcHiReg).addReg(HiReg); 12968 BuildMI(mainMBB, DL, TII->get(HiOpc), cH); 12969 BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); 12970 // cc := if (src_hi == hi) ? 
cl : ch; 12971 if (Subtarget->hasCMov()) { 12972 BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) 12973 .addReg(cH32).addReg(cL32); 12974 } else { 12975 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) 12976 .addReg(cH32).addReg(cL32) 12977 .addImm(X86::COND_E); 12978 mainMBB = EmitLoweredSelect(MIB, mainMBB); 12979 } 12980 BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); 12981 if (Subtarget->hasCMov()) { 12982 BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L) 12983 .addReg(SrcLoReg).addReg(LoReg); 12984 BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H) 12985 .addReg(SrcHiReg).addReg(HiReg); 12986 } else { 12987 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L) 12988 .addReg(SrcLoReg).addReg(LoReg) 12989 .addImm(X86::COND_NE); 12990 mainMBB = EmitLoweredSelect(MIB, mainMBB); 12991 MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H) 12992 .addReg(SrcHiReg).addReg(HiReg) 12993 .addImm(X86::COND_NE); 12994 mainMBB = EmitLoweredSelect(MIB, mainMBB); 12995 } 12996 break; 12997 } 12998 case X86::ATOMSWAP6432: { 12999 unsigned HiOpc; 13000 unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); 13001 BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg); 13002 BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg); 13003 break; 13004 } 13005 } 13006 13007 // Copy EDX:EAX back from HiReg:LoReg 13008 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg); 13009 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg); 13010 // Copy ECX:EBX from t1H:t1L 13011 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L); 13012 BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H); 13013 13014 MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); 13015 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 13016 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 13017 MIB.setMemRefs(MMOBegin, MMOEnd); 13018 13019 BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); 13020 13021 mainMBB->addSuccessor(origMainMBB); 13022 mainMBB->addSuccessor(sinkMBB); 13023 13024 // sinkMBB: 13025 sinkMBB->addLiveIn(X86::EAX); 13026 sinkMBB->addLiveIn(X86::EDX); 13027 13028 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 13029 TII->get(TargetOpcode::COPY), DstLoReg) 13030 .addReg(X86::EAX); 13031 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 13032 TII->get(TargetOpcode::COPY), DstHiReg) 13033 .addReg(X86::EDX); 13034 13035 MI->eraseFromParent(); 13036 return sinkMBB; 13037} 13038 13039// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 13040// or XMM0_V32I8 in AVX all of this code can be replaced with that 13041// in the .td file. 
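// EmitPCMPSTRM rewrites the PCMPxSTRM pseudos into the corresponding real
// (V)PCMPISTRM / (V)PCMPESTRM instructions and copies the implicit XMM0
// result into the pseudo's destination register.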
13042static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, 13043 const TargetInstrInfo *TII) { 13044 unsigned Opc; 13045 switch (MI->getOpcode()) { 13046 default: llvm_unreachable("illegal opcode!"); 13047 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; 13048 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; 13049 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; 13050 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; 13051 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; 13052 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; 13053 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; 13054 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; 13055 } 13056 13057 DebugLoc dl = MI->getDebugLoc(); 13058 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 13059 13060 unsigned NumArgs = MI->getNumOperands(); 13061 for (unsigned i = 1; i < NumArgs; ++i) { 13062 MachineOperand &Op = MI->getOperand(i); 13063 if (!(Op.isReg() && Op.isImplicit())) 13064 MIB.addOperand(Op); 13065 } 13066 if (MI->hasOneMemOperand()) 13067 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 13068 13069 BuildMI(*BB, MI, dl, 13070 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 13071 .addReg(X86::XMM0); 13072 13073 MI->eraseFromParent(); 13074 return BB; 13075} 13076 13077// FIXME: Custom handling because TableGen doesn't support multiple implicit 13078// defs in an instruction pattern 13079static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, 13080 const TargetInstrInfo *TII) { 13081 unsigned Opc; 13082 switch (MI->getOpcode()) { 13083 default: llvm_unreachable("illegal opcode!"); 13084 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; 13085 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; 13086 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; 13087 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; 13088 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; 13089 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; 13090 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; 13091 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; 13092 } 13093 13094 DebugLoc dl = MI->getDebugLoc(); 13095 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 13096 13097 unsigned NumArgs = MI->getNumOperands(); // remove the results 13098 for (unsigned i = 1; i < NumArgs; ++i) { 13099 MachineOperand &Op = MI->getOperand(i); 13100 if (!(Op.isReg() && Op.isImplicit())) 13101 MIB.addOperand(Op); 13102 } 13103 if (MI->hasOneMemOperand()) 13104 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 13105 13106 BuildMI(*BB, MI, dl, 13107 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 13108 .addReg(X86::ECX); 13109 13110 MI->eraseFromParent(); 13111 return BB; 13112} 13113 13114static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, 13115 const TargetInstrInfo *TII, 13116 const X86Subtarget* Subtarget) { 13117 DebugLoc dl = MI->getDebugLoc(); 13118 13119 // Address into RAX/EAX, other two args into ECX, EDX. 13120 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 13121 unsigned MemReg = Subtarget->is64Bit() ? 
X86::RAX : X86::EAX; 13122 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 13123 for (int i = 0; i < X86::AddrNumOperands; ++i) 13124 MIB.addOperand(MI->getOperand(i)); 13125 13126 unsigned ValOps = X86::AddrNumOperands; 13127 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 13128 .addReg(MI->getOperand(ValOps).getReg()); 13129 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 13130 .addReg(MI->getOperand(ValOps+1).getReg()); 13131 13132 // The instruction doesn't actually take any operands though. 13133 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 13134 13135 MI->eraseFromParent(); // The pseudo is gone now. 13136 return BB; 13137} 13138 13139MachineBasicBlock * 13140X86TargetLowering::EmitVAARG64WithCustomInserter( 13141 MachineInstr *MI, 13142 MachineBasicBlock *MBB) const { 13143 // Emit va_arg instruction on X86-64. 13144 13145 // Operands to this pseudo-instruction: 13146 // 0 ) Output : destination address (reg) 13147 // 1-5) Input : va_list address (addr, i64mem) 13148 // 6 ) ArgSize : Size (in bytes) of vararg type 13149 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 13150 // 8 ) Align : Alignment of type 13151 // 9 ) EFLAGS (implicit-def) 13152 13153 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 13154 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 13155 13156 unsigned DestReg = MI->getOperand(0).getReg(); 13157 MachineOperand &Base = MI->getOperand(1); 13158 MachineOperand &Scale = MI->getOperand(2); 13159 MachineOperand &Index = MI->getOperand(3); 13160 MachineOperand &Disp = MI->getOperand(4); 13161 MachineOperand &Segment = MI->getOperand(5); 13162 unsigned ArgSize = MI->getOperand(6).getImm(); 13163 unsigned ArgMode = MI->getOperand(7).getImm(); 13164 unsigned Align = MI->getOperand(8).getImm(); 13165 13166 // Memory Reference 13167 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 13168 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 13169 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 13170 13171 // Machine Information 13172 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 13173 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 13174 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 13175 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 13176 DebugLoc DL = MI->getDebugLoc(); 13177 13178 // struct va_list { 13179 // i32 gp_offset 13180 // i32 fp_offset 13181 // i64 overflow_area (address) 13182 // i64 reg_save_area (address) 13183 // } 13184 // sizeof(va_list) = 24 13185 // alignment(va_list) = 8 13186 13187 unsigned TotalNumIntRegs = 6; 13188 unsigned TotalNumXMMRegs = 8; 13189 bool UseGPOffset = (ArgMode == 1); 13190 bool UseFPOffset = (ArgMode == 2); 13191 unsigned MaxOffset = TotalNumIntRegs * 8 + 13192 (UseFPOffset ? 
TotalNumXMMRegs * 16 : 0); 13193 13194 /* Align ArgSize to a multiple of 8 */ 13195 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 13196 bool NeedsAlign = (Align > 8); 13197 13198 MachineBasicBlock *thisMBB = MBB; 13199 MachineBasicBlock *overflowMBB; 13200 MachineBasicBlock *offsetMBB; 13201 MachineBasicBlock *endMBB; 13202 13203 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 13204 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 13205 unsigned OffsetReg = 0; 13206 13207 if (!UseGPOffset && !UseFPOffset) { 13208 // If we only pull from the overflow region, we don't create a branch. 13209 // We don't need to alter control flow. 13210 OffsetDestReg = 0; // unused 13211 OverflowDestReg = DestReg; 13212 13213 offsetMBB = NULL; 13214 overflowMBB = thisMBB; 13215 endMBB = thisMBB; 13216 } else { 13217 // First emit code to check if gp_offset (or fp_offset) is below the bound. 13218 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 13219 // If not, pull from overflow_area. (branch to overflowMBB) 13220 // 13221 // thisMBB 13222 // | . 13223 // | . 13224 // offsetMBB overflowMBB 13225 // | . 13226 // | . 13227 // endMBB 13228 13229 // Registers for the PHI in endMBB 13230 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 13231 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 13232 13233 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 13234 MachineFunction *MF = MBB->getParent(); 13235 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 13236 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 13237 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 13238 13239 MachineFunction::iterator MBBIter = MBB; 13240 ++MBBIter; 13241 13242 // Insert the new basic blocks 13243 MF->insert(MBBIter, offsetMBB); 13244 MF->insert(MBBIter, overflowMBB); 13245 MF->insert(MBBIter, endMBB); 13246 13247 // Transfer the remainder of MBB and its successor edges to endMBB. 13248 endMBB->splice(endMBB->begin(), thisMBB, 13249 llvm::next(MachineBasicBlock::iterator(MI)), 13250 thisMBB->end()); 13251 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 13252 13253 // Make offsetMBB and overflowMBB successors of thisMBB 13254 thisMBB->addSuccessor(offsetMBB); 13255 thisMBB->addSuccessor(overflowMBB); 13256 13257 // endMBB is a successor of both offsetMBB and overflowMBB 13258 offsetMBB->addSuccessor(endMBB); 13259 overflowMBB->addSuccessor(endMBB); 13260 13261 // Load the offset value into a register 13262 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 13263 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 13264 .addOperand(Base) 13265 .addOperand(Scale) 13266 .addOperand(Index) 13267 .addDisp(Disp, UseFPOffset ? 4 : 0) 13268 .addOperand(Segment) 13269 .setMemRefs(MMOBegin, MMOEnd); 13270 13271 // Check if there is enough room left to pull this argument. 13272 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 13273 .addReg(OffsetReg) 13274 .addImm(MaxOffset + 8 - ArgSizeA8); 13275 13276 // Branch to "overflowMBB" if offset >= max 13277 // Fall through to "offsetMBB" otherwise 13278 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 13279 .addMBB(overflowMBB); 13280 } 13281 13282 // In offsetMBB, emit code to use the reg_save_area. 13283 if (offsetMBB) { 13284 assert(OffsetReg != 0); 13285 13286 // Read the reg_save_area address. 
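    // (Offset 16 into the va_list: past the two i32 offsets and the i64
    // overflow_area pointer laid out above.)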
13287 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 13288 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 13289 .addOperand(Base) 13290 .addOperand(Scale) 13291 .addOperand(Index) 13292 .addDisp(Disp, 16) 13293 .addOperand(Segment) 13294 .setMemRefs(MMOBegin, MMOEnd); 13295 13296 // Zero-extend the offset 13297 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 13298 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 13299 .addImm(0) 13300 .addReg(OffsetReg) 13301 .addImm(X86::sub_32bit); 13302 13303 // Add the offset to the reg_save_area to get the final address. 13304 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 13305 .addReg(OffsetReg64) 13306 .addReg(RegSaveReg); 13307 13308 // Compute the offset for the next argument 13309 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 13310 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 13311 .addReg(OffsetReg) 13312 .addImm(UseFPOffset ? 16 : 8); 13313 13314 // Store it back into the va_list. 13315 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 13316 .addOperand(Base) 13317 .addOperand(Scale) 13318 .addOperand(Index) 13319 .addDisp(Disp, UseFPOffset ? 4 : 0) 13320 .addOperand(Segment) 13321 .addReg(NextOffsetReg) 13322 .setMemRefs(MMOBegin, MMOEnd); 13323 13324 // Jump to endMBB 13325 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 13326 .addMBB(endMBB); 13327 } 13328 13329 // 13330 // Emit code to use overflow area 13331 // 13332 13333 // Load the overflow_area address into a register. 13334 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 13335 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 13336 .addOperand(Base) 13337 .addOperand(Scale) 13338 .addOperand(Index) 13339 .addDisp(Disp, 8) 13340 .addOperand(Segment) 13341 .setMemRefs(MMOBegin, MMOEnd); 13342 13343 // If we need to align it, do so. Otherwise, just copy the address 13344 // to OverflowDestReg. 13345 if (NeedsAlign) { 13346 // Align the overflow address 13347 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 13348 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 13349 13350 // aligned_addr = (addr + (align-1)) & ~(align-1) 13351 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 13352 .addReg(OverflowAddrReg) 13353 .addImm(Align-1); 13354 13355 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 13356 .addReg(TmpReg) 13357 .addImm(~(uint64_t)(Align-1)); 13358 } else { 13359 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 13360 .addReg(OverflowAddrReg); 13361 } 13362 13363 // Compute the next overflow address after this argument. 13364 // (the overflow address should be kept 8-byte aligned) 13365 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 13366 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 13367 .addReg(OverflowDestReg) 13368 .addImm(ArgSizeA8); 13369 13370 // Store the new overflow address. 13371 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 13372 .addOperand(Base) 13373 .addOperand(Scale) 13374 .addOperand(Index) 13375 .addDisp(Disp, 8) 13376 .addOperand(Segment) 13377 .addReg(NextAddrReg) 13378 .setMemRefs(MMOBegin, MMOEnd); 13379 13380 // If we branched, emit the PHI to the front of endMBB. 
13381 if (offsetMBB) { 13382 BuildMI(*endMBB, endMBB->begin(), DL, 13383 TII->get(X86::PHI), DestReg) 13384 .addReg(OffsetDestReg).addMBB(offsetMBB) 13385 .addReg(OverflowDestReg).addMBB(overflowMBB); 13386 } 13387 13388 // Erase the pseudo instruction 13389 MI->eraseFromParent(); 13390 13391 return endMBB; 13392} 13393 13394MachineBasicBlock * 13395X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 13396 MachineInstr *MI, 13397 MachineBasicBlock *MBB) const { 13398 // Emit code to save XMM registers to the stack. The ABI says that the 13399 // number of registers to save is given in %al, so it's theoretically 13400 // possible to do an indirect jump trick to avoid saving all of them, 13401 // however this code takes a simpler approach and just executes all 13402 // of the stores if %al is non-zero. It's less code, and it's probably 13403 // easier on the hardware branch predictor, and stores aren't all that 13404 // expensive anyway. 13405 13406 // Create the new basic blocks. One block contains all the XMM stores, 13407 // and one block is the final destination regardless of whether any 13408 // stores were performed. 13409 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 13410 MachineFunction *F = MBB->getParent(); 13411 MachineFunction::iterator MBBIter = MBB; 13412 ++MBBIter; 13413 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 13414 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 13415 F->insert(MBBIter, XMMSaveMBB); 13416 F->insert(MBBIter, EndMBB); 13417 13418 // Transfer the remainder of MBB and its successor edges to EndMBB. 13419 EndMBB->splice(EndMBB->begin(), MBB, 13420 llvm::next(MachineBasicBlock::iterator(MI)), 13421 MBB->end()); 13422 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 13423 13424 // The original block will now fall through to the XMM save block. 13425 MBB->addSuccessor(XMMSaveMBB); 13426 // The XMMSaveMBB will fall through to the end block. 13427 XMMSaveMBB->addSuccessor(EndMBB); 13428 13429 // Now add the instructions. 13430 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 13431 DebugLoc DL = MI->getDebugLoc(); 13432 13433 unsigned CountReg = MI->getOperand(0).getReg(); 13434 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 13435 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 13436 13437 if (!Subtarget->isTargetWin64()) { 13438 // If %al is 0, branch around the XMM save block. 13439 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 13440 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 13441 MBB->addSuccessor(EndMBB); 13442 } 13443 13444 unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; 13445 // In the XMM save block, save all the XMM argument registers. 13446 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 13447 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 13448 MachineMemOperand *MMO = 13449 F->getMachineMemOperand( 13450 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 13451 MachineMemOperand::MOStore, 13452 /*Size=*/16, /*Align=*/16); 13453 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) 13454 .addFrameIndex(RegSaveFrameIndex) 13455 .addImm(/*Scale=*/1) 13456 .addReg(/*IndexReg=*/0) 13457 .addImm(/*Disp=*/Offset) 13458 .addReg(/*Segment=*/0) 13459 .addReg(MI->getOperand(i).getReg()) 13460 .addMemOperand(MMO); 13461 } 13462 13463 MI->eraseFromParent(); // The pseudo instruction is gone now. 
13464 13465 return EndMBB; 13466} 13467 13468// The EFLAGS operand of SelectItr might be missing a kill marker 13469// because there were multiple uses of EFLAGS, and ISel didn't know 13470// which to mark. Figure out whether SelectItr should have had a 13471// kill marker, and set it if it should. Returns the correct kill 13472// marker value. 13473static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, 13474 MachineBasicBlock* BB, 13475 const TargetRegisterInfo* TRI) { 13476 // Scan forward through BB for a use/def of EFLAGS. 13477 MachineBasicBlock::iterator miI(llvm::next(SelectItr)); 13478 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 13479 const MachineInstr& mi = *miI; 13480 if (mi.readsRegister(X86::EFLAGS)) 13481 return false; 13482 if (mi.definesRegister(X86::EFLAGS)) 13483 break; // Should have kill-flag - update below. 13484 } 13485 13486 // If we hit the end of the block, check whether EFLAGS is live into a 13487 // successor. 13488 if (miI == BB->end()) { 13489 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 13490 sEnd = BB->succ_end(); 13491 sItr != sEnd; ++sItr) { 13492 MachineBasicBlock* succ = *sItr; 13493 if (succ->isLiveIn(X86::EFLAGS)) 13494 return false; 13495 } 13496 } 13497 13498 // We found a def, or hit the end of the basic block and EFLAGS wasn't live 13499 // out. SelectMI should have a kill flag on EFLAGS. 13500 SelectItr->addRegisterKilled(X86::EFLAGS, TRI); 13501 return true; 13502} 13503 13504MachineBasicBlock * 13505X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 13506 MachineBasicBlock *BB) const { 13507 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 13508 DebugLoc DL = MI->getDebugLoc(); 13509 13510 // To "insert" a SELECT_CC instruction, we actually have to insert the 13511 // diamond control-flow pattern. The incoming instruction knows the 13512 // destination vreg to set, the condition code register to branch on, the 13513 // true/false values to select between, and a branch opcode to use. 13514 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 13515 MachineFunction::iterator It = BB; 13516 ++It; 13517 13518 // thisMBB: 13519 // ... 13520 // TrueVal = ... 13521 // cmpTY ccX, r1, r2 13522 // bCC copy1MBB 13523 // fallthrough --> copy0MBB 13524 MachineBasicBlock *thisMBB = BB; 13525 MachineFunction *F = BB->getParent(); 13526 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 13527 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 13528 F->insert(It, copy0MBB); 13529 F->insert(It, sinkMBB); 13530 13531 // If the EFLAGS register isn't dead in the terminator, then claim that it's 13532 // live into the sink and copy blocks. 13533 const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); 13534 if (!MI->killsRegister(X86::EFLAGS) && 13535 !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { 13536 copy0MBB->addLiveIn(X86::EFLAGS); 13537 sinkMBB->addLiveIn(X86::EFLAGS); 13538 } 13539 13540 // Transfer the remainder of BB and its successor edges to sinkMBB. 13541 sinkMBB->splice(sinkMBB->begin(), BB, 13542 llvm::next(MachineBasicBlock::iterator(MI)), 13543 BB->end()); 13544 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 13545 13546 // Add the true and fallthrough blocks as its successors. 13547 BB->addSuccessor(copy0MBB); 13548 BB->addSuccessor(sinkMBB); 13549 13550 // Create the conditional branch instruction. 
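  // (The condition code is carried as the immediate in operand 3 of the
  // select pseudo; translate it into the matching JCC opcode.)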
13551 unsigned Opc = 13552 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 13553 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 13554 13555 // copy0MBB: 13556 // %FalseValue = ... 13557 // # fallthrough to sinkMBB 13558 copy0MBB->addSuccessor(sinkMBB); 13559 13560 // sinkMBB: 13561 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 13562 // ... 13563 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 13564 TII->get(X86::PHI), MI->getOperand(0).getReg()) 13565 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 13566 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 13567 13568 MI->eraseFromParent(); // The pseudo instruction is gone now. 13569 return sinkMBB; 13570} 13571 13572MachineBasicBlock * 13573X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, 13574 bool Is64Bit) const { 13575 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 13576 DebugLoc DL = MI->getDebugLoc(); 13577 MachineFunction *MF = BB->getParent(); 13578 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 13579 13580 assert(getTargetMachine().Options.EnableSegmentedStacks); 13581 13582 unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; 13583 unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; 13584 13585 // BB: 13586 // ... [Till the alloca] 13587 // If stacklet is not large enough, jump to mallocMBB 13588 // 13589 // bumpMBB: 13590 // Allocate by subtracting from RSP 13591 // Jump to continueMBB 13592 // 13593 // mallocMBB: 13594 // Allocate by call to runtime 13595 // 13596 // continueMBB: 13597 // ... 13598 // [rest of original BB] 13599 // 13600 13601 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); 13602 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); 13603 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); 13604 13605 MachineRegisterInfo &MRI = MF->getRegInfo(); 13606 const TargetRegisterClass *AddrRegClass = 13607 getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); 13608 13609 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), 13610 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), 13611 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), 13612 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), 13613 sizeVReg = MI->getOperand(1).getReg(), 13614 physSPReg = Is64Bit ? X86::RSP : X86::ESP; 13615 13616 MachineFunction::iterator MBBIter = BB; 13617 ++MBBIter; 13618 13619 MF->insert(MBBIter, bumpMBB); 13620 MF->insert(MBBIter, mallocMBB); 13621 MF->insert(MBBIter, continueMBB); 13622 13623 continueMBB->splice(continueMBB->begin(), BB, llvm::next 13624 (MachineBasicBlock::iterator(MI)), BB->end()); 13625 continueMBB->transferSuccessorsAndUpdatePHIs(BB); 13626 13627 // Add code to the main basic block to check if the stack limit has been hit, 13628 // and if so, jump to mallocMBB otherwise to bumpMBB. 13629 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); 13630 BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) 13631 .addReg(tmpSPVReg).addReg(sizeVReg); 13632 BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) 13633 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) 13634 .addReg(SPLimitVReg); 13635 BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); 13636 13637 // bumpMBB simply decreases the stack pointer, since we know the current 13638 // stacklet has enough space. 
13639 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) 13640 .addReg(SPLimitVReg); 13641 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) 13642 .addReg(SPLimitVReg); 13643 BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 13644 13645 // Calls into a routine in libgcc to allocate more space from the heap. 13646 const uint32_t *RegMask = 13647 getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); 13648 if (Is64Bit) { 13649 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) 13650 .addReg(sizeVReg); 13651 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 13652 .addExternalSymbol("__morestack_allocate_stack_space") 13653 .addRegMask(RegMask) 13654 .addReg(X86::RDI, RegState::Implicit) 13655 .addReg(X86::RAX, RegState::ImplicitDefine); 13656 } else { 13657 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) 13658 .addImm(12); 13659 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); 13660 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) 13661 .addExternalSymbol("__morestack_allocate_stack_space") 13662 .addRegMask(RegMask) 13663 .addReg(X86::EAX, RegState::ImplicitDefine); 13664 } 13665 13666 if (!Is64Bit) 13667 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) 13668 .addImm(16); 13669 13670 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) 13671 .addReg(Is64Bit ? X86::RAX : X86::EAX); 13672 BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 13673 13674 // Set up the CFG correctly. 13675 BB->addSuccessor(bumpMBB); 13676 BB->addSuccessor(mallocMBB); 13677 mallocMBB->addSuccessor(continueMBB); 13678 bumpMBB->addSuccessor(continueMBB); 13679 13680 // Take care of the PHI nodes. 13681 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), 13682 MI->getOperand(0).getReg()) 13683 .addReg(mallocPtrVReg).addMBB(mallocMBB) 13684 .addReg(bumpSPPtrVReg).addMBB(bumpMBB); 13685 13686 // Delete the original pseudo instruction. 13687 MI->eraseFromParent(); 13688 13689 // And we're done. 13690 return continueMBB; 13691} 13692 13693MachineBasicBlock * 13694X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 13695 MachineBasicBlock *BB) const { 13696 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 13697 DebugLoc DL = MI->getDebugLoc(); 13698 13699 assert(!Subtarget->isTargetEnvMacho()); 13700 13701 // The lowering is pretty easy: we're just emitting the call to _alloca. The 13702 // non-trivial part is impdef of ESP. 13703 13704 if (Subtarget->isTargetWin64()) { 13705 if (Subtarget->isTargetCygMing()) { 13706 // ___chkstk(Mingw64): 13707 // Clobbers R10, R11, RAX and EFLAGS. 13708 // Updates RSP. 13709 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 13710 .addExternalSymbol("___chkstk") 13711 .addReg(X86::RAX, RegState::Implicit) 13712 .addReg(X86::RSP, RegState::Implicit) 13713 .addReg(X86::RAX, RegState::Define | RegState::Implicit) 13714 .addReg(X86::RSP, RegState::Define | RegState::Implicit) 13715 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 13716 } else { 13717 // __chkstk(MSVCRT): does not update stack pointer. 13718 // Clobbers R10, R11 and EFLAGS. 13719 // FIXME: RAX(allocated size) might be reused and not killed. 13720 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 13721 .addExternalSymbol("__chkstk") 13722 .addReg(X86::RAX, RegState::Implicit) 13723 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 13724 // RAX has the offset to subtracted from RSP. 
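  // The emitted sequence for this flavour is therefore roughly (a sketch;
  // it assumes earlier lowering has already placed the allocation size in RAX,
  // as the implicit RAX use above requires):
  //
  //   call __chkstk        ; probes the new pages, leaves the size in RAX
  //   sub  rsp, rax        ; the SUB64rr built below does the actual adjustment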
13725 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) 13726 .addReg(X86::RSP) 13727 .addReg(X86::RAX); 13728 } 13729 } else { 13730 const char *StackProbeSymbol = 13731 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 13732 13733 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 13734 .addExternalSymbol(StackProbeSymbol) 13735 .addReg(X86::EAX, RegState::Implicit) 13736 .addReg(X86::ESP, RegState::Implicit) 13737 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 13738 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 13739 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 13740 } 13741 13742 MI->eraseFromParent(); // The pseudo instruction is gone now. 13743 return BB; 13744} 13745 13746MachineBasicBlock * 13747X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 13748 MachineBasicBlock *BB) const { 13749 // This is pretty easy. We're taking the value that we received from 13750 // our load from the relocation, sticking it in either RDI (x86-64) 13751 // or EAX and doing an indirect call. The return value will then 13752 // be in the normal return register. 13753 const X86InstrInfo *TII 13754 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 13755 DebugLoc DL = MI->getDebugLoc(); 13756 MachineFunction *F = BB->getParent(); 13757 13758 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 13759 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 13760 13761 // Get a register mask for the lowered call. 13762 // FIXME: The 32-bit calls have non-standard calling conventions. Use a 13763 // proper register mask. 13764 const uint32_t *RegMask = 13765 getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); 13766 if (Subtarget->is64Bit()) { 13767 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 13768 TII->get(X86::MOV64rm), X86::RDI) 13769 .addReg(X86::RIP) 13770 .addImm(0).addReg(0) 13771 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 13772 MI->getOperand(3).getTargetFlags()) 13773 .addReg(0); 13774 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 13775 addDirectMem(MIB, X86::RDI); 13776 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); 13777 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 13778 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 13779 TII->get(X86::MOV32rm), X86::EAX) 13780 .addReg(0) 13781 .addImm(0).addReg(0) 13782 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 13783 MI->getOperand(3).getTargetFlags()) 13784 .addReg(0); 13785 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 13786 addDirectMem(MIB, X86::EAX); 13787 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 13788 } else { 13789 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 13790 TII->get(X86::MOV32rm), X86::EAX) 13791 .addReg(TII->getGlobalBaseReg(F)) 13792 .addImm(0).addReg(0) 13793 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 13794 MI->getOperand(3).getTargetFlags()) 13795 .addReg(0); 13796 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 13797 addDirectMem(MIB, X86::EAX); 13798 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 13799 } 13800 13801 MI->eraseFromParent(); // The pseudo instruction is gone now. 
13802 return BB; 13803} 13804 13805MachineBasicBlock * 13806X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, 13807 MachineBasicBlock *MBB) const { 13808 DebugLoc DL = MI->getDebugLoc(); 13809 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 13810 13811 MachineFunction *MF = MBB->getParent(); 13812 MachineRegisterInfo &MRI = MF->getRegInfo(); 13813 13814 const BasicBlock *BB = MBB->getBasicBlock(); 13815 MachineFunction::iterator I = MBB; 13816 ++I; 13817 13818 // Memory Reference 13819 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 13820 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 13821 13822 unsigned DstReg; 13823 unsigned MemOpndSlot = 0; 13824 13825 unsigned CurOp = 0; 13826 13827 DstReg = MI->getOperand(CurOp++).getReg(); 13828 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 13829 assert(RC->hasType(MVT::i32) && "Invalid destination!"); 13830 unsigned mainDstReg = MRI.createVirtualRegister(RC); 13831 unsigned restoreDstReg = MRI.createVirtualRegister(RC); 13832 13833 MemOpndSlot = CurOp; 13834 13835 MVT PVT = getPointerTy(); 13836 assert((PVT == MVT::i64 || PVT == MVT::i32) && 13837 "Invalid Pointer Size!"); 13838 13839 // For v = setjmp(buf), we generate 13840 // 13841 // thisMBB: 13842 // buf[LabelOffset] = restoreMBB 13843 // SjLjSetup restoreMBB 13844 // 13845 // mainMBB: 13846 // v_main = 0 13847 // 13848 // sinkMBB: 13849 // v = phi(main, restore) 13850 // 13851 // restoreMBB: 13852 // v_restore = 1 13853 13854 MachineBasicBlock *thisMBB = MBB; 13855 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 13856 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 13857 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); 13858 MF->insert(I, mainMBB); 13859 MF->insert(I, sinkMBB); 13860 MF->push_back(restoreMBB); 13861 13862 MachineInstrBuilder MIB; 13863 13864 // Transfer the remainder of BB and its successor edges to sinkMBB. 13865 sinkMBB->splice(sinkMBB->begin(), MBB, 13866 llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); 13867 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 13868 13869 // thisMBB: 13870 unsigned PtrStoreOpc = 0; 13871 unsigned LabelReg = 0; 13872 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 13873 Reloc::Model RM = getTargetMachine().getRelocationModel(); 13874 bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) && 13875 (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); 13876 13877 // Prepare IP either in reg or imm. 13878 if (!UseImmLabel) { 13879 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; 13880 const TargetRegisterClass *PtrRC = getRegClassFor(PVT); 13881 LabelReg = MRI.createVirtualRegister(PtrRC); 13882 if (Subtarget->is64Bit()) { 13883 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) 13884 .addReg(X86::RIP) 13885 .addImm(0) 13886 .addReg(0) 13887 .addMBB(restoreMBB) 13888 .addReg(0); 13889 } else { 13890 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); 13891 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) 13892 .addReg(XII->getGlobalBaseReg(MF)) 13893 .addImm(0) 13894 .addReg(0) 13895 .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) 13896 .addReg(0); 13897 } 13898 } else 13899 PtrStoreOpc = (PVT == MVT::i64) ? 
X86::MOV64mi32 : X86::MOV32mi; 13900 // Store IP 13901 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); 13902 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 13903 if (i == X86::AddrDisp) 13904 MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); 13905 else 13906 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 13907 } 13908 if (!UseImmLabel) 13909 MIB.addReg(LabelReg); 13910 else 13911 MIB.addMBB(restoreMBB); 13912 MIB.setMemRefs(MMOBegin, MMOEnd); 13913 // Setup 13914 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) 13915 .addMBB(restoreMBB); 13916 MIB.addRegMask(RegInfo->getNoPreservedMask()); 13917 thisMBB->addSuccessor(mainMBB); 13918 thisMBB->addSuccessor(restoreMBB); 13919 13920 // mainMBB: 13921 // EAX = 0 13922 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); 13923 mainMBB->addSuccessor(sinkMBB); 13924 13925 // sinkMBB: 13926 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 13927 TII->get(X86::PHI), DstReg) 13928 .addReg(mainDstReg).addMBB(mainMBB) 13929 .addReg(restoreDstReg).addMBB(restoreMBB); 13930 13931 // restoreMBB: 13932 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); 13933 BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB); 13934 restoreMBB->addSuccessor(sinkMBB); 13935 13936 MI->eraseFromParent(); 13937 return sinkMBB; 13938} 13939 13940MachineBasicBlock * 13941X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, 13942 MachineBasicBlock *MBB) const { 13943 DebugLoc DL = MI->getDebugLoc(); 13944 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 13945 13946 MachineFunction *MF = MBB->getParent(); 13947 MachineRegisterInfo &MRI = MF->getRegInfo(); 13948 13949 // Memory Reference 13950 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 13951 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 13952 13953 MVT PVT = getPointerTy(); 13954 assert((PVT == MVT::i64 || PVT == MVT::i32) && 13955 "Invalid Pointer Size!"); 13956 13957 const TargetRegisterClass *RC = 13958 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; 13959 unsigned Tmp = MRI.createVirtualRegister(RC); 13960 // Since FP is only updated here but NOT referenced, it's treated as GPR. 13961 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; 13962 unsigned SP = RegInfo->getStackRegister(); 13963 13964 MachineInstrBuilder MIB; 13965 13966 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 13967 const int64_t SPOffset = 2 * PVT.getStoreSize(); 13968 13969 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; 13970 unsigned IJmpOpc = (PVT == MVT::i64) ? 
X86::JMP64r : X86::JMP32r; 13971 13972 // Reload FP 13973 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); 13974 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 13975 MIB.addOperand(MI->getOperand(i)); 13976 MIB.setMemRefs(MMOBegin, MMOEnd); 13977 // Reload IP 13978 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); 13979 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 13980 if (i == X86::AddrDisp) 13981 MIB.addDisp(MI->getOperand(i), LabelOffset); 13982 else 13983 MIB.addOperand(MI->getOperand(i)); 13984 } 13985 MIB.setMemRefs(MMOBegin, MMOEnd); 13986 // Reload SP 13987 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); 13988 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 13989 if (i == X86::AddrDisp) 13990 MIB.addDisp(MI->getOperand(i), SPOffset); 13991 else 13992 MIB.addOperand(MI->getOperand(i)); 13993 } 13994 MIB.setMemRefs(MMOBegin, MMOEnd); 13995 // Jump 13996 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); 13997 13998 MI->eraseFromParent(); 13999 return MBB; 14000} 14001 14002MachineBasicBlock * 14003X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 14004 MachineBasicBlock *BB) const { 14005 switch (MI->getOpcode()) { 14006 default: llvm_unreachable("Unexpected instr type to insert"); 14007 case X86::TAILJMPd64: 14008 case X86::TAILJMPr64: 14009 case X86::TAILJMPm64: 14010 llvm_unreachable("TAILJMP64 would not be touched here."); 14011 case X86::TCRETURNdi64: 14012 case X86::TCRETURNri64: 14013 case X86::TCRETURNmi64: 14014 return BB; 14015 case X86::WIN_ALLOCA: 14016 return EmitLoweredWinAlloca(MI, BB); 14017 case X86::SEG_ALLOCA_32: 14018 return EmitLoweredSegAlloca(MI, BB, false); 14019 case X86::SEG_ALLOCA_64: 14020 return EmitLoweredSegAlloca(MI, BB, true); 14021 case X86::TLSCall_32: 14022 case X86::TLSCall_64: 14023 return EmitLoweredTLSCall(MI, BB); 14024 case X86::CMOV_GR8: 14025 case X86::CMOV_FR32: 14026 case X86::CMOV_FR64: 14027 case X86::CMOV_V4F32: 14028 case X86::CMOV_V2F64: 14029 case X86::CMOV_V2I64: 14030 case X86::CMOV_V8F32: 14031 case X86::CMOV_V4F64: 14032 case X86::CMOV_V4I64: 14033 case X86::CMOV_GR16: 14034 case X86::CMOV_GR32: 14035 case X86::CMOV_RFP32: 14036 case X86::CMOV_RFP64: 14037 case X86::CMOV_RFP80: 14038 return EmitLoweredSelect(MI, BB); 14039 14040 case X86::FP32_TO_INT16_IN_MEM: 14041 case X86::FP32_TO_INT32_IN_MEM: 14042 case X86::FP32_TO_INT64_IN_MEM: 14043 case X86::FP64_TO_INT16_IN_MEM: 14044 case X86::FP64_TO_INT32_IN_MEM: 14045 case X86::FP64_TO_INT64_IN_MEM: 14046 case X86::FP80_TO_INT16_IN_MEM: 14047 case X86::FP80_TO_INT32_IN_MEM: 14048 case X86::FP80_TO_INT64_IN_MEM: { 14049 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 14050 DebugLoc DL = MI->getDebugLoc(); 14051 14052 // Change the floating point control register to use "round towards zero" 14053 // mode when truncating to an integer value. 14054 MachineFunction *F = BB->getParent(); 14055 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 14056 addFrameReference(BuildMI(*BB, MI, DL, 14057 TII->get(X86::FNSTCW16m)), CWFrameIdx); 14058 14059 // Load the old value of the high byte of the control word... 14060 unsigned OldCW = 14061 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); 14062 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 14063 CWFrameIdx); 14064 14065 // Set the high part to be round to zero... 14066 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 14067 .addImm(0xC7F); 14068 14069 // Reload the modified control word now... 
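  // For reference, the 0xC7F stored above decodes as follows in the x87
  // control word (a sketch of the relevant fields):
  //
  //   bits 11:10 = 11 -> rounding control: round toward zero (truncate)
  //   bits  9:8  = 00 -> precision control: 24 bits
  //   bits  5:0  = 1s -> all floating-point exceptions masked
  //
  // Forcing truncation is what gives these FP_TO_INT*_IN_MEM pseudos the
  // C-style round-toward-zero conversion semantics.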
14070 addFrameReference(BuildMI(*BB, MI, DL, 14071 TII->get(X86::FLDCW16m)), CWFrameIdx); 14072 14073 // Restore the memory image of control word to original value 14074 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 14075 .addReg(OldCW); 14076 14077 // Get the X86 opcode to use. 14078 unsigned Opc; 14079 switch (MI->getOpcode()) { 14080 default: llvm_unreachable("illegal opcode!"); 14081 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 14082 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 14083 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 14084 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 14085 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 14086 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 14087 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 14088 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 14089 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 14090 } 14091 14092 X86AddressMode AM; 14093 MachineOperand &Op = MI->getOperand(0); 14094 if (Op.isReg()) { 14095 AM.BaseType = X86AddressMode::RegBase; 14096 AM.Base.Reg = Op.getReg(); 14097 } else { 14098 AM.BaseType = X86AddressMode::FrameIndexBase; 14099 AM.Base.FrameIndex = Op.getIndex(); 14100 } 14101 Op = MI->getOperand(1); 14102 if (Op.isImm()) 14103 AM.Scale = Op.getImm(); 14104 Op = MI->getOperand(2); 14105 if (Op.isImm()) 14106 AM.IndexReg = Op.getImm(); 14107 Op = MI->getOperand(3); 14108 if (Op.isGlobal()) { 14109 AM.GV = Op.getGlobal(); 14110 } else { 14111 AM.Disp = Op.getImm(); 14112 } 14113 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 14114 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 14115 14116 // Reload the original control word now. 14117 addFrameReference(BuildMI(*BB, MI, DL, 14118 TII->get(X86::FLDCW16m)), CWFrameIdx); 14119 14120 MI->eraseFromParent(); // The pseudo instruction is gone now. 14121 return BB; 14122 } 14123 // String/text processing lowering. 14124 case X86::PCMPISTRM128REG: 14125 case X86::VPCMPISTRM128REG: 14126 case X86::PCMPISTRM128MEM: 14127 case X86::VPCMPISTRM128MEM: 14128 case X86::PCMPESTRM128REG: 14129 case X86::VPCMPESTRM128REG: 14130 case X86::PCMPESTRM128MEM: 14131 case X86::VPCMPESTRM128MEM: 14132 assert(Subtarget->hasSSE42() && 14133 "Target must have SSE4.2 or AVX features enabled"); 14134 return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo()); 14135 14136 // String/text processing lowering. 14137 case X86::PCMPISTRIREG: 14138 case X86::VPCMPISTRIREG: 14139 case X86::PCMPISTRIMEM: 14140 case X86::VPCMPISTRIMEM: 14141 case X86::PCMPESTRIREG: 14142 case X86::VPCMPESTRIREG: 14143 case X86::PCMPESTRIMEM: 14144 case X86::VPCMPESTRIMEM: 14145 assert(Subtarget->hasSSE42() && 14146 "Target must have SSE4.2 or AVX features enabled"); 14147 return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo()); 14148 14149 // Thread synchronization. 14150 case X86::MONITOR: 14151 return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget); 14152 14153 // xbegin 14154 case X86::XBEGIN: 14155 return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo()); 14156 14157 // Atomic Lowering. 
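  // Each ATOM* pseudo below is expanded by EmitAtomicLoadArith into a
  // compare-and-swap loop.  In C-like pseudo code the expansion is roughly
  // (a sketch; the real expansion works on virtual registers and EFLAGS):
  //
  //   T Old = *Addr;
  //   do {
  //     T New = Old <op> Val;          // AND/OR/XOR/NAND/MIN/MAX/UMIN/UMAX
  //   } while (!compare_and_swap(Addr, &Old, New));   // retry on contention
  //   Result = Old;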
14158 case X86::ATOMAND8: 14159 case X86::ATOMAND16: 14160 case X86::ATOMAND32: 14161 case X86::ATOMAND64: 14162 // Fall through 14163 case X86::ATOMOR8: 14164 case X86::ATOMOR16: 14165 case X86::ATOMOR32: 14166 case X86::ATOMOR64: 14167 // Fall through 14168 case X86::ATOMXOR16: 14169 case X86::ATOMXOR8: 14170 case X86::ATOMXOR32: 14171 case X86::ATOMXOR64: 14172 // Fall through 14173 case X86::ATOMNAND8: 14174 case X86::ATOMNAND16: 14175 case X86::ATOMNAND32: 14176 case X86::ATOMNAND64: 14177 // Fall through 14178 case X86::ATOMMAX8: 14179 case X86::ATOMMAX16: 14180 case X86::ATOMMAX32: 14181 case X86::ATOMMAX64: 14182 // Fall through 14183 case X86::ATOMMIN8: 14184 case X86::ATOMMIN16: 14185 case X86::ATOMMIN32: 14186 case X86::ATOMMIN64: 14187 // Fall through 14188 case X86::ATOMUMAX8: 14189 case X86::ATOMUMAX16: 14190 case X86::ATOMUMAX32: 14191 case X86::ATOMUMAX64: 14192 // Fall through 14193 case X86::ATOMUMIN8: 14194 case X86::ATOMUMIN16: 14195 case X86::ATOMUMIN32: 14196 case X86::ATOMUMIN64: 14197 return EmitAtomicLoadArith(MI, BB); 14198 14199 // This group does 64-bit operations on a 32-bit host. 14200 case X86::ATOMAND6432: 14201 case X86::ATOMOR6432: 14202 case X86::ATOMXOR6432: 14203 case X86::ATOMNAND6432: 14204 case X86::ATOMADD6432: 14205 case X86::ATOMSUB6432: 14206 case X86::ATOMMAX6432: 14207 case X86::ATOMMIN6432: 14208 case X86::ATOMUMAX6432: 14209 case X86::ATOMUMIN6432: 14210 case X86::ATOMSWAP6432: 14211 return EmitAtomicLoadArith6432(MI, BB); 14212 14213 case X86::VASTART_SAVE_XMM_REGS: 14214 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 14215 14216 case X86::VAARG_64: 14217 return EmitVAARG64WithCustomInserter(MI, BB); 14218 14219 case X86::EH_SjLj_SetJmp32: 14220 case X86::EH_SjLj_SetJmp64: 14221 return emitEHSjLjSetJmp(MI, BB); 14222 14223 case X86::EH_SjLj_LongJmp32: 14224 case X86::EH_SjLj_LongJmp64: 14225 return emitEHSjLjLongJmp(MI, BB); 14226 } 14227} 14228 14229//===----------------------------------------------------------------------===// 14230// X86 Optimization Hooks 14231//===----------------------------------------------------------------------===// 14232 14233void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 14234 APInt &KnownZero, 14235 APInt &KnownOne, 14236 const SelectionDAG &DAG, 14237 unsigned Depth) const { 14238 unsigned BitWidth = KnownZero.getBitWidth(); 14239 unsigned Opc = Op.getOpcode(); 14240 assert((Opc >= ISD::BUILTIN_OP_END || 14241 Opc == ISD::INTRINSIC_WO_CHAIN || 14242 Opc == ISD::INTRINSIC_W_CHAIN || 14243 Opc == ISD::INTRINSIC_VOID) && 14244 "Should use MaskedValueIsZero if you don't know whether Op" 14245 " is a target node!"); 14246 14247 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 14248 switch (Opc) { 14249 default: break; 14250 case X86ISD::ADD: 14251 case X86ISD::SUB: 14252 case X86ISD::ADC: 14253 case X86ISD::SBB: 14254 case X86ISD::SMUL: 14255 case X86ISD::UMUL: 14256 case X86ISD::INC: 14257 case X86ISD::DEC: 14258 case X86ISD::OR: 14259 case X86ISD::XOR: 14260 case X86ISD::AND: 14261 // These nodes' second result is a boolean. 
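  // Concretely (a sketch for an i32-wide query): only bit 0 of that boolean
  // result can ever be set, so the code below reports
  //
  //   KnownZero |= APInt::getHighBitsSet(32, 31);   // i.e. 0xFFFFFFFE
  //
  // which lets later combines drop redundant masking and zero-extension of
  // flag-derived values.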
14262 if (Op.getResNo() == 0) 14263 break; 14264 // Fallthrough 14265 case X86ISD::SETCC: 14266 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 14267 break; 14268 case ISD::INTRINSIC_WO_CHAIN: { 14269 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 14270 unsigned NumLoBits = 0; 14271 switch (IntId) { 14272 default: break; 14273 case Intrinsic::x86_sse_movmsk_ps: 14274 case Intrinsic::x86_avx_movmsk_ps_256: 14275 case Intrinsic::x86_sse2_movmsk_pd: 14276 case Intrinsic::x86_avx_movmsk_pd_256: 14277 case Intrinsic::x86_mmx_pmovmskb: 14278 case Intrinsic::x86_sse2_pmovmskb_128: 14279 case Intrinsic::x86_avx2_pmovmskb: { 14280 // High bits of movmskp{s|d}, pmovmskb are known zero. 14281 switch (IntId) { 14282 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 14283 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; 14284 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; 14285 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; 14286 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; 14287 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; 14288 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; 14289 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; 14290 } 14291 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); 14292 break; 14293 } 14294 } 14295 break; 14296 } 14297 } 14298} 14299 14300unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 14301 unsigned Depth) const { 14302 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 14303 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 14304 return Op.getValueType().getScalarType().getSizeInBits(); 14305 14306 // Fallback case. 14307 return 1; 14308} 14309 14310/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 14311/// node is a GlobalAddress + offset. 
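/// For example (a sketch), an (X86ISD::Wrapper (GlobalAddress @g + 8)) node
/// yields GA = @g and Offset = 8; any other form defers to the generic
/// TargetLowering::isGAPlusOffset handling.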
14312bool X86TargetLowering::isGAPlusOffset(SDNode *N, 14313 const GlobalValue* &GA, 14314 int64_t &Offset) const { 14315 if (N->getOpcode() == X86ISD::Wrapper) { 14316 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 14317 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 14318 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 14319 return true; 14320 } 14321 } 14322 return TargetLowering::isGAPlusOffset(N, GA, Offset); 14323} 14324 14325/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 14326/// same as extracting the high 128-bit part of 256-bit vector and then 14327/// inserting the result into the low part of a new 256-bit vector 14328static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 14329 EVT VT = SVOp->getValueType(0); 14330 unsigned NumElems = VT.getVectorNumElements(); 14331 14332 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 14333 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) 14334 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 14335 SVOp->getMaskElt(j) >= 0) 14336 return false; 14337 14338 return true; 14339} 14340 14341/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 14342/// same as extracting the low 128-bit part of 256-bit vector and then 14343/// inserting the result into the high part of a new 256-bit vector 14344static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 14345 EVT VT = SVOp->getValueType(0); 14346 unsigned NumElems = VT.getVectorNumElements(); 14347 14348 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 14349 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) 14350 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 14351 SVOp->getMaskElt(j) >= 0) 14352 return false; 14353 14354 return true; 14355} 14356 14357/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 14358static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 14359 TargetLowering::DAGCombinerInfo &DCI, 14360 const X86Subtarget* Subtarget) { 14361 DebugLoc dl = N->getDebugLoc(); 14362 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 14363 SDValue V1 = SVOp->getOperand(0); 14364 SDValue V2 = SVOp->getOperand(1); 14365 EVT VT = SVOp->getValueType(0); 14366 unsigned NumElems = VT.getVectorNumElements(); 14367 14368 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 14369 V2.getOpcode() == ISD::CONCAT_VECTORS) { 14370 // 14371 // 0,0,0,... 14372 // | 14373 // V UNDEF BUILD_VECTOR UNDEF 14374 // \ / \ / 14375 // CONCAT_VECTOR CONCAT_VECTOR 14376 // \ / 14377 // \ / 14378 // RESULT: V + zero extended 14379 // 14380 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 14381 V2.getOperand(1).getOpcode() != ISD::UNDEF || 14382 V1.getOperand(1).getOpcode() != ISD::UNDEF) 14383 return SDValue(); 14384 14385 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 14386 return SDValue(); 14387 14388 // To match the shuffle mask, the first half of the mask should 14389 // be exactly the first vector, and all the rest a splat with the 14390 // first element of the second one. 14391 for (unsigned i = 0; i != NumElems/2; ++i) 14392 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 14393 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 14394 return SDValue(); 14395 14396 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 
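  // In DAG terms the fold below is, for v4i64 (a sketch):
  //
  //   (vector_shuffle (concat_vectors (load p), undef),
  //                   (concat_vectors (build_vector 0, 0), undef),
  //                   <0, 1, 4, 4>)
  //     -->  (bitcast (X86ISD::VZEXT_LOAD p))
  //
  // i.e. "load 128 bits and zero the upper half", which an AVX 128-bit load
  // already does implicitly for the upper lanes of the ymm register.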
14397 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { 14398 if (Ld->hasNUsesOfValue(1, 0)) { 14399 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); 14400 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; 14401 SDValue ResNode = 14402 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2, 14403 Ld->getMemoryVT(), 14404 Ld->getPointerInfo(), 14405 Ld->getAlignment(), 14406 false/*isVolatile*/, true/*ReadMem*/, 14407 false/*WriteMem*/); 14408 14409 // Make sure the newly-created LOAD is in the same position as Ld in 14410 // terms of dependency. We create a TokenFactor for Ld and ResNode, 14411 // and update uses of Ld's output chain to use the TokenFactor. 14412 if (Ld->hasAnyUseOfValue(1)) { 14413 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 14414 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); 14415 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); 14416 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), 14417 SDValue(ResNode.getNode(), 1)); 14418 } 14419 14420 return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); 14421 } 14422 } 14423 14424 // Emit a zeroed vector and insert the desired subvector on its 14425 // first half. 14426 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 14427 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); 14428 return DCI.CombineTo(N, InsV); 14429 } 14430 14431 //===--------------------------------------------------------------------===// 14432 // Combine some shuffles into subvector extracts and inserts: 14433 // 14434 14435 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 14436 if (isShuffleHigh128VectorInsertLow(SVOp)) { 14437 SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); 14438 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); 14439 return DCI.CombineTo(N, InsV); 14440 } 14441 14442 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 14443 if (isShuffleLow128VectorInsertHigh(SVOp)) { 14444 SDValue V = Extract128BitVector(V1, 0, DAG, dl); 14445 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); 14446 return DCI.CombineTo(N, InsV); 14447 } 14448 14449 return SDValue(); 14450} 14451 14452/// PerformShuffleCombine - Performs several different shuffle combines. 14453static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 14454 TargetLowering::DAGCombinerInfo &DCI, 14455 const X86Subtarget *Subtarget) { 14456 DebugLoc dl = N->getDebugLoc(); 14457 EVT VT = N->getValueType(0); 14458 14459 // Don't create instructions with illegal types after legalize types has run. 14460 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14461 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 14462 return SDValue(); 14463 14464 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode 14465 if (Subtarget->hasFp256() && VT.is256BitVector() && 14466 N->getOpcode() == ISD::VECTOR_SHUFFLE) 14467 return PerformShuffleCombine256(N, DAG, DCI, Subtarget); 14468 14469 // Only handle 128 wide vector from here on. 14470 if (!VT.is128BitVector()) 14471 return SDValue(); 14472 14473 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 14474 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 14475 // consecutive, non-overlapping, and in the right order. 
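  // For v4f32 this turns the equivalent of (a sketch)
  //
  //   x = { p[0], p[1], p[2], p[3] };   // four scalar loads + build_vector
  //
  // back into a single
  //
  //   x = *(const v4f32 *)p;            // one 128-bit load
  //
  // when EltsFromConsecutiveLoads can prove the four addresses are p, p+4,
  // p+8 and p+12 bytes, with no overlap.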
14476 SmallVector<SDValue, 16> Elts; 14477 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 14478 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 14479 14480 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 14481} 14482 14483/// PerformTruncateCombine - Converts truncate operation to 14484/// a sequence of vector shuffle operations. 14485/// It is possible when we truncate 256-bit vector to 128-bit vector 14486static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, 14487 TargetLowering::DAGCombinerInfo &DCI, 14488 const X86Subtarget *Subtarget) { 14489 return SDValue(); 14490} 14491 14492/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target 14493/// specific shuffle of a load can be folded into a single element load. 14494/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but 14495/// shuffles have been customed lowered so we need to handle those here. 14496static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, 14497 TargetLowering::DAGCombinerInfo &DCI) { 14498 if (DCI.isBeforeLegalizeOps()) 14499 return SDValue(); 14500 14501 SDValue InVec = N->getOperand(0); 14502 SDValue EltNo = N->getOperand(1); 14503 14504 if (!isa<ConstantSDNode>(EltNo)) 14505 return SDValue(); 14506 14507 EVT VT = InVec.getValueType(); 14508 14509 bool HasShuffleIntoBitcast = false; 14510 if (InVec.getOpcode() == ISD::BITCAST) { 14511 // Don't duplicate a load with other uses. 14512 if (!InVec.hasOneUse()) 14513 return SDValue(); 14514 EVT BCVT = InVec.getOperand(0).getValueType(); 14515 if (BCVT.getVectorNumElements() != VT.getVectorNumElements()) 14516 return SDValue(); 14517 InVec = InVec.getOperand(0); 14518 HasShuffleIntoBitcast = true; 14519 } 14520 14521 if (!isTargetShuffle(InVec.getOpcode())) 14522 return SDValue(); 14523 14524 // Don't duplicate a load with other uses. 14525 if (!InVec.hasOneUse()) 14526 return SDValue(); 14527 14528 SmallVector<int, 16> ShuffleMask; 14529 bool UnaryShuffle; 14530 if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask, 14531 UnaryShuffle)) 14532 return SDValue(); 14533 14534 // Select the input vector, guarding against out of range extract vector. 14535 unsigned NumElems = VT.getVectorNumElements(); 14536 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 14537 int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; 14538 SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) 14539 : InVec.getOperand(1); 14540 14541 // If inputs to shuffle are the same for both ops, then allow 2 uses 14542 unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; 14543 14544 if (LdNode.getOpcode() == ISD::BITCAST) { 14545 // Don't duplicate a load with other uses. 14546 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) 14547 return SDValue(); 14548 14549 AllowedUses = 1; // only allow 1 load use if we have a bitcast 14550 LdNode = LdNode.getOperand(0); 14551 } 14552 14553 if (!ISD::isNormalLoad(LdNode.getNode())) 14554 return SDValue(); 14555 14556 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); 14557 14558 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) 14559 return SDValue(); 14560 14561 if (HasShuffleIntoBitcast) { 14562 // If there's a bitcast before the shuffle, check if the load type and 14563 // alignment is valid. 
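  // Example (a sketch): if the original load is known to be only 4-byte
  // aligned but the post-bitcast vector type VT has a 16-byte ABI alignment,
  // then NewAlign > Align below and the fold is skipped rather than re-typing
  // the load.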
14564 unsigned Align = LN0->getAlignment(); 14565 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14566 unsigned NewAlign = TLI.getDataLayout()-> 14567 getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); 14568 14569 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 14570 return SDValue(); 14571 } 14572 14573 // All checks match so transform back to vector_shuffle so that DAG combiner 14574 // can finish the job 14575 DebugLoc dl = N->getDebugLoc(); 14576 14577 // Create shuffle node taking into account the case that its a unary shuffle 14578 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); 14579 Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl, 14580 InVec.getOperand(0), Shuffle, 14581 &ShuffleMask[0]); 14582 Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 14583 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, 14584 EltNo); 14585} 14586 14587/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 14588/// generation and convert it from being a bunch of shuffles and extracts 14589/// to a simple store and scalar loads to extract the elements. 14590static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 14591 TargetLowering::DAGCombinerInfo &DCI) { 14592 SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); 14593 if (NewOp.getNode()) 14594 return NewOp; 14595 14596 SDValue InputVector = N->getOperand(0); 14597 // Detect whether we are trying to convert from mmx to i32 and the bitcast 14598 // from mmx to v2i32 has a single usage. 14599 if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST && 14600 InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx && 14601 InputVector.hasOneUse() && N->getValueType(0) == MVT::i32) 14602 return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(), 14603 N->getValueType(0), 14604 InputVector.getNode()->getOperand(0)); 14605 14606 // Only operate on vectors of 4 elements, where the alternative shuffling 14607 // gets to be more expensive. 14608 if (InputVector.getValueType() != MVT::v4i32) 14609 return SDValue(); 14610 14611 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 14612 // single use which is a sign-extend or zero-extend, and all elements are 14613 // used. 14614 SmallVector<SDNode *, 4> Uses; 14615 unsigned ExtractedElements = 0; 14616 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 14617 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 14618 if (UI.getUse().getResNo() != InputVector.getResNo()) 14619 return SDValue(); 14620 14621 SDNode *Extract = *UI; 14622 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 14623 return SDValue(); 14624 14625 if (Extract->getValueType(0) != MVT::i32) 14626 return SDValue(); 14627 if (!Extract->hasOneUse()) 14628 return SDValue(); 14629 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 14630 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 14631 return SDValue(); 14632 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 14633 return SDValue(); 14634 14635 // Record which element was extracted. 14636 ExtractedElements |= 14637 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 14638 14639 Uses.push_back(Extract); 14640 } 14641 14642 // If not all the elements were used, this may not be worthwhile. 14643 if (ExtractedElements != 15) 14644 return SDValue(); 14645 14646 // Ok, we've now decided to do the transformation. 
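  // The rewrite performed below is, in C-like form (a sketch for v4i32):
  //
  //   int Tmp[4];
  //   *(v4i32 *)Tmp = InputVector;      // one vector store to a stack slot
  //   A = Tmp[0]; B = Tmp[1]; C = Tmp[2]; D = Tmp[3];   // plain scalar loads
  //
  // which avoids a shuffle-per-lane extraction when every lane is consumed by
  // a sign- or zero-extension.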
14647 DebugLoc dl = InputVector.getDebugLoc(); 14648 14649 // Store the value to a temporary stack slot. 14650 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 14651 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 14652 MachinePointerInfo(), false, false, 0); 14653 14654 // Replace each use (extract) with a load of the appropriate element. 14655 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 14656 UE = Uses.end(); UI != UE; ++UI) { 14657 SDNode *Extract = *UI; 14658 14659 // cOMpute the element's address. 14660 SDValue Idx = Extract->getOperand(1); 14661 unsigned EltSize = 14662 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 14663 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 14664 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14665 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 14666 14667 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), 14668 StackPtr, OffsetVal); 14669 14670 // Load the scalar. 14671 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 14672 ScalarAddr, MachinePointerInfo(), 14673 false, false, false, 0); 14674 14675 // Replace the exact with the load. 14676 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 14677 } 14678 14679 // The replacement was made in place; don't return anything. 14680 return SDValue(); 14681} 14682 14683/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match. 14684static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, 14685 SDValue RHS, SelectionDAG &DAG, 14686 const X86Subtarget *Subtarget) { 14687 if (!VT.isVector()) 14688 return 0; 14689 14690 switch (VT.getSimpleVT().SimpleTy) { 14691 default: return 0; 14692 case MVT::v32i8: 14693 case MVT::v16i16: 14694 case MVT::v8i32: 14695 if (!Subtarget->hasAVX2()) 14696 return 0; 14697 case MVT::v16i8: 14698 case MVT::v8i16: 14699 case MVT::v4i32: 14700 if (!Subtarget->hasSSE2()) 14701 return 0; 14702 } 14703 14704 // SSE2 has only a small subset of the operations. 14705 bool hasUnsigned = Subtarget->hasSSE41() || 14706 (Subtarget->hasSSE2() && VT == MVT::v16i8); 14707 bool hasSigned = Subtarget->hasSSE41() || 14708 (Subtarget->hasSSE2() && VT == MVT::v8i16); 14709 14710 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 14711 14712 // Check for x CC y ? x : y. 14713 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 14714 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 14715 switch (CC) { 14716 default: break; 14717 case ISD::SETULT: 14718 case ISD::SETULE: 14719 return hasUnsigned ? X86ISD::UMIN : 0; 14720 case ISD::SETUGT: 14721 case ISD::SETUGE: 14722 return hasUnsigned ? X86ISD::UMAX : 0; 14723 case ISD::SETLT: 14724 case ISD::SETLE: 14725 return hasSigned ? X86ISD::SMIN : 0; 14726 case ISD::SETGT: 14727 case ISD::SETGE: 14728 return hasSigned ? X86ISD::SMAX : 0; 14729 } 14730 // Check for x CC y ? y : x -- a min/max with reversed arms. 14731 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 14732 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 14733 switch (CC) { 14734 default: break; 14735 case ISD::SETULT: 14736 case ISD::SETULE: 14737 return hasUnsigned ? X86ISD::UMAX : 0; 14738 case ISD::SETUGT: 14739 case ISD::SETUGE: 14740 return hasUnsigned ? X86ISD::UMIN : 0; 14741 case ISD::SETLT: 14742 case ISD::SETLE: 14743 return hasSigned ? X86ISD::SMAX : 0; 14744 case ISD::SETGT: 14745 case ISD::SETGE: 14746 return hasSigned ? 
X86ISD::SMIN : 0; 14747 } 14748 } 14749 14750 return 0; 14751} 14752 14753/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT 14754/// nodes. 14755static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 14756 TargetLowering::DAGCombinerInfo &DCI, 14757 const X86Subtarget *Subtarget) { 14758 DebugLoc DL = N->getDebugLoc(); 14759 SDValue Cond = N->getOperand(0); 14760 // Get the LHS/RHS of the select. 14761 SDValue LHS = N->getOperand(1); 14762 SDValue RHS = N->getOperand(2); 14763 EVT VT = LHS.getValueType(); 14764 14765 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 14766 // instructions match the semantics of the common C idiom x<y?x:y but not 14767 // x<=y?x:y, because of how they handle negative zero (which can be 14768 // ignored in unsafe-math mode). 14769 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && 14770 VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) && 14771 (Subtarget->hasSSE2() || 14772 (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { 14773 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 14774 14775 unsigned Opcode = 0; 14776 // Check for x CC y ? x : y. 14777 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 14778 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 14779 switch (CC) { 14780 default: break; 14781 case ISD::SETULT: 14782 // Converting this to a min would handle NaNs incorrectly, and swapping 14783 // the operands would cause it to handle comparisons between positive 14784 // and negative zero incorrectly. 14785 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 14786 if (!DAG.getTarget().Options.UnsafeFPMath && 14787 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 14788 break; 14789 std::swap(LHS, RHS); 14790 } 14791 Opcode = X86ISD::FMIN; 14792 break; 14793 case ISD::SETOLE: 14794 // Converting this to a min would handle comparisons between positive 14795 // and negative zero incorrectly. 14796 if (!DAG.getTarget().Options.UnsafeFPMath && 14797 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 14798 break; 14799 Opcode = X86ISD::FMIN; 14800 break; 14801 case ISD::SETULE: 14802 // Converting this to a min would handle both negative zeros and NaNs 14803 // incorrectly, but we can swap the operands to fix both. 14804 std::swap(LHS, RHS); 14805 case ISD::SETOLT: 14806 case ISD::SETLT: 14807 case ISD::SETLE: 14808 Opcode = X86ISD::FMIN; 14809 break; 14810 14811 case ISD::SETOGE: 14812 // Converting this to a max would handle comparisons between positive 14813 // and negative zero incorrectly. 14814 if (!DAG.getTarget().Options.UnsafeFPMath && 14815 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 14816 break; 14817 Opcode = X86ISD::FMAX; 14818 break; 14819 case ISD::SETUGT: 14820 // Converting this to a max would handle NaNs incorrectly, and swapping 14821 // the operands would cause it to handle comparisons between positive 14822 // and negative zero incorrectly. 14823 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 14824 if (!DAG.getTarget().Options.UnsafeFPMath && 14825 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 14826 break; 14827 std::swap(LHS, RHS); 14828 } 14829 Opcode = X86ISD::FMAX; 14830 break; 14831 case ISD::SETUGE: 14832 // Converting this to a max would handle both negative zeros and NaNs 14833 // incorrectly, but we can swap the operands to fix both. 
14834 std::swap(LHS, RHS); 14835 case ISD::SETOGT: 14836 case ISD::SETGT: 14837 case ISD::SETGE: 14838 Opcode = X86ISD::FMAX; 14839 break; 14840 } 14841 // Check for x CC y ? y : x -- a min/max with reversed arms. 14842 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 14843 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 14844 switch (CC) { 14845 default: break; 14846 case ISD::SETOGE: 14847 // Converting this to a min would handle comparisons between positive 14848 // and negative zero incorrectly, and swapping the operands would 14849 // cause it to handle NaNs incorrectly. 14850 if (!DAG.getTarget().Options.UnsafeFPMath && 14851 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 14852 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 14853 break; 14854 std::swap(LHS, RHS); 14855 } 14856 Opcode = X86ISD::FMIN; 14857 break; 14858 case ISD::SETUGT: 14859 // Converting this to a min would handle NaNs incorrectly. 14860 if (!DAG.getTarget().Options.UnsafeFPMath && 14861 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 14862 break; 14863 Opcode = X86ISD::FMIN; 14864 break; 14865 case ISD::SETUGE: 14866 // Converting this to a min would handle both negative zeros and NaNs 14867 // incorrectly, but we can swap the operands to fix both. 14868 std::swap(LHS, RHS); 14869 case ISD::SETOGT: 14870 case ISD::SETGT: 14871 case ISD::SETGE: 14872 Opcode = X86ISD::FMIN; 14873 break; 14874 14875 case ISD::SETULT: 14876 // Converting this to a max would handle NaNs incorrectly. 14877 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 14878 break; 14879 Opcode = X86ISD::FMAX; 14880 break; 14881 case ISD::SETOLE: 14882 // Converting this to a max would handle comparisons between positive 14883 // and negative zero incorrectly, and swapping the operands would 14884 // cause it to handle NaNs incorrectly. 14885 if (!DAG.getTarget().Options.UnsafeFPMath && 14886 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 14887 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 14888 break; 14889 std::swap(LHS, RHS); 14890 } 14891 Opcode = X86ISD::FMAX; 14892 break; 14893 case ISD::SETULE: 14894 // Converting this to a max would handle both negative zeros and NaNs 14895 // incorrectly, but we can swap the operands to fix both. 14896 std::swap(LHS, RHS); 14897 case ISD::SETOLT: 14898 case ISD::SETLT: 14899 case ISD::SETLE: 14900 Opcode = X86ISD::FMAX; 14901 break; 14902 } 14903 } 14904 14905 if (Opcode) 14906 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 14907 } 14908 14909 // If this is a select between two integer constants, try to do some 14910 // optimizations. 14911 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 14912 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 14913 // Don't do this for crazy integer types. 14914 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 14915 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 14916 // so that TrueC (the true value) is larger than FalseC. 14917 bool NeedsCondInvert = false; 14918 14919 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 14920 // Efficiently invertible. 14921 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 14922 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 14923 isa<ConstantSDNode>(Cond.getOperand(1))))) { 14924 NeedsCondInvert = true; 14925 std::swap(TrueC, FalseC); 14926 } 14927 14928 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 
14929 if (FalseC->getAPIntValue() == 0 && 14930 TrueC->getAPIntValue().isPowerOf2()) { 14931 if (NeedsCondInvert) // Invert the condition if needed. 14932 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 14933 DAG.getConstant(1, Cond.getValueType())); 14934 14935 // Zero extend the condition if needed. 14936 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 14937 14938 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 14939 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 14940 DAG.getConstant(ShAmt, MVT::i8)); 14941 } 14942 14943 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 14944 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 14945 if (NeedsCondInvert) // Invert the condition if needed. 14946 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 14947 DAG.getConstant(1, Cond.getValueType())); 14948 14949 // Zero extend the condition if needed. 14950 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 14951 FalseC->getValueType(0), Cond); 14952 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 14953 SDValue(FalseC, 0)); 14954 } 14955 14956 // Optimize cases that will turn into an LEA instruction. This requires 14957 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 14958 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 14959 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 14960 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 14961 14962 bool isFastMultiplier = false; 14963 if (Diff < 10) { 14964 switch ((unsigned char)Diff) { 14965 default: break; 14966 case 1: // result = add base, cond 14967 case 2: // result = lea base( , cond*2) 14968 case 3: // result = lea base(cond, cond*2) 14969 case 4: // result = lea base( , cond*4) 14970 case 5: // result = lea base(cond, cond*4) 14971 case 8: // result = lea base( , cond*8) 14972 case 9: // result = lea base(cond, cond*8) 14973 isFastMultiplier = true; 14974 break; 14975 } 14976 } 14977 14978 if (isFastMultiplier) { 14979 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 14980 if (NeedsCondInvert) // Invert the condition if needed. 14981 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 14982 DAG.getConstant(1, Cond.getValueType())); 14983 14984 // Zero extend the condition if needed. 14985 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 14986 Cond); 14987 // Scale the condition by the difference. 14988 if (Diff != 1) 14989 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 14990 DAG.getConstant(Diff, Cond.getValueType())); 14991 14992 // Add the base if non-zero. 14993 if (FalseC->getAPIntValue() != 0) 14994 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 14995 SDValue(FalseC, 0)); 14996 return Cond; 14997 } 14998 } 14999 } 15000 } 15001 15002 // Canonicalize max and min: 15003 // (x > y) ? x : y -> (x >= y) ? x : y 15004 // (x < y) ? x : y -> (x <= y) ? x : y 15005 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates 15006 // the need for an extra compare 15007 // against zero. e.g. 15008 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 
0 15009 // subl %esi, %edi 15010 // testl %edi, %edi 15011 // movl $0, %eax 15012 // cmovgl %edi, %eax 15013 // => 15014 // xorl %eax, %eax 15015 // subl %esi, $edi 15016 // cmovsl %eax, %edi 15017 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && 15018 DAG.isEqualTo(LHS, Cond.getOperand(0)) && 15019 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 15020 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 15021 switch (CC) { 15022 default: break; 15023 case ISD::SETLT: 15024 case ISD::SETGT: { 15025 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; 15026 Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(), 15027 Cond.getOperand(0), Cond.getOperand(1), NewCC); 15028 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); 15029 } 15030 } 15031 } 15032 15033 // Match VSELECTs into subs with unsigned saturation. 15034 if (!DCI.isBeforeLegalize() && 15035 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && 15036 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. 15037 ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || 15038 (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { 15039 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 15040 15041 // Check if one of the arms of the VSELECT is a zero vector. If it's on the 15042 // left side invert the predicate to simplify logic below. 15043 SDValue Other; 15044 if (ISD::isBuildVectorAllZeros(LHS.getNode())) { 15045 Other = RHS; 15046 CC = ISD::getSetCCInverse(CC, true); 15047 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { 15048 Other = LHS; 15049 } 15050 15051 if (Other.getNode() && Other->getNumOperands() == 2 && 15052 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { 15053 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); 15054 SDValue CondRHS = Cond->getOperand(1); 15055 15056 // Look for a general sub with unsigned saturation first. 15057 // x >= y ? x-y : 0 --> subus x, y 15058 // x > y ? x-y : 0 --> subus x, y 15059 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && 15060 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) 15061 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); 15062 15063 // If the RHS is a constant we have to reverse the const canonicalization. 15064 // x > C-1 ? x+-C : 0 --> subus x, C 15065 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && 15066 isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { 15067 APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); 15068 if (CondRHS.getConstantOperandVal(0) == -A-1) { 15069 SmallVector<SDValue, 32> V(VT.getVectorNumElements(), 15070 DAG.getConstant(-A, VT.getScalarType())); 15071 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, 15072 DAG.getNode(ISD::BUILD_VECTOR, DL, VT, 15073 V.data(), V.size())); 15074 } 15075 } 15076 15077 // Another special case: If C was a sign bit, the sub has been 15078 // canonicalized into a xor. 15079 // FIXME: Would it be better to use ComputeMaskedBits to determine whether 15080 // it's safe to decanonicalize the xor? 15081 // x s< 0 ? 
x^C : 0 --> subus x, C 15082 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && 15083 ISD::isBuildVectorAllZeros(CondRHS.getNode()) && 15084 isSplatVector(OpRHS.getNode())) { 15085 APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); 15086 if (A.isSignBit()) 15087 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); 15088 } 15089 } 15090 } 15091 15092 // Try to match a min/max vector operation. 15093 if (!DCI.isBeforeLegalize() && 15094 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) 15095 if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget)) 15096 return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS); 15097 15098 // If we know that this node is legal then we know that it is going to be 15099 // matched by one of the SSE/AVX BLEND instructions. These instructions only 15100 // depend on the highest bit in each word. Try to use SimplifyDemandedBits 15101 // to simplify previous instructions. 15102 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15103 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && 15104 !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) { 15105 unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); 15106 15107 // Don't optimize vector selects that map to mask-registers. 15108 if (BitWidth == 1) 15109 return SDValue(); 15110 15111 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); 15112 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); 15113 15114 APInt KnownZero, KnownOne; 15115 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), 15116 DCI.isBeforeLegalizeOps()); 15117 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || 15118 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) 15119 DCI.CommitTargetLoweringOpt(TLO); 15120 } 15121 15122 return SDValue(); 15123} 15124 15125// Check whether a boolean test is testing a boolean value generated by 15126// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition 15127// code. 15128// 15129// Simplify the following patterns: 15130// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or 15131// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) 15132// to (Op EFLAGS Cond) 15133// 15134// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or 15135// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) 15136// to (Op EFLAGS !Cond) 15137// 15138// where Op could be BRCOND or CMOV. 15139// 15140static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { 15141 // Quit if not CMP and SUB with its value result used. 15142 if (Cmp.getOpcode() != X86ISD::CMP && 15143 (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) 15144 return SDValue(); 15145 15146 // Quit if not used as a boolean value. 15147 if (CC != X86::COND_E && CC != X86::COND_NE) 15148 return SDValue(); 15149 15150 // Check CMP operands. One of them should be 0 or 1 and the other should be 15151 // an SetCC or extended from it. 15152 SDValue Op1 = Cmp.getOperand(0); 15153 SDValue Op2 = Cmp.getOperand(1); 15154 15155 SDValue SetCC; 15156 const ConstantSDNode* C = 0; 15157 bool needOppositeCond = (CC == X86::COND_E); 15158 15159 if ((C = dyn_cast<ConstantSDNode>(Op1))) 15160 SetCC = Op2; 15161 else if ((C = dyn_cast<ConstantSDNode>(Op2))) 15162 SetCC = Op1; 15163 else // Quit if all operands are not constants. 15164 return SDValue(); 15165 15166 if (C->getZExtValue() == 1) 15167 needOppositeCond = !needOppositeCond; 15168 else if (C->getZExtValue() != 0) 15169 // Quit if the constant is neither 0 or 1. 
15170 return SDValue(); 15171 15172 // Skip 'zext' node. 15173 if (SetCC.getOpcode() == ISD::ZERO_EXTEND) 15174 SetCC = SetCC.getOperand(0); 15175 15176 switch (SetCC.getOpcode()) { 15177 case X86ISD::SETCC: 15178 // Set the condition code or opposite one if necessary. 15179 CC = X86::CondCode(SetCC.getConstantOperandVal(0)); 15180 if (needOppositeCond) 15181 CC = X86::GetOppositeBranchCondition(CC); 15182 return SetCC.getOperand(1); 15183 case X86ISD::CMOV: { 15184 // Check whether false/true value has canonical one, i.e. 0 or 1. 15185 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); 15186 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); 15187 // Quit if true value is not a constant. 15188 if (!TVal) 15189 return SDValue(); 15190 // Quit if false value is not a constant. 15191 if (!FVal) { 15192 // A special case for rdrand, where 0 is set if false cond is found. 15193 SDValue Op = SetCC.getOperand(0); 15194 if (Op.getOpcode() != X86ISD::RDRAND) 15195 return SDValue(); 15196 } 15197 // Quit if false value is not the constant 0 or 1. 15198 bool FValIsFalse = true; 15199 if (FVal && FVal->getZExtValue() != 0) { 15200 if (FVal->getZExtValue() != 1) 15201 return SDValue(); 15202 // If FVal is 1, opposite cond is needed. 15203 needOppositeCond = !needOppositeCond; 15204 FValIsFalse = false; 15205 } 15206 // Quit if TVal is not the constant opposite of FVal. 15207 if (FValIsFalse && TVal->getZExtValue() != 1) 15208 return SDValue(); 15209 if (!FValIsFalse && TVal->getZExtValue() != 0) 15210 return SDValue(); 15211 CC = X86::CondCode(SetCC.getConstantOperandVal(2)); 15212 if (needOppositeCond) 15213 CC = X86::GetOppositeBranchCondition(CC); 15214 return SetCC.getOperand(3); 15215 } 15216 } 15217 15218 return SDValue(); 15219} 15220 15221/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 15222static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 15223 TargetLowering::DAGCombinerInfo &DCI, 15224 const X86Subtarget *Subtarget) { 15225 DebugLoc DL = N->getDebugLoc(); 15226 15227 // If the flag operand isn't dead, don't touch this CMOV. 15228 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 15229 return SDValue(); 15230 15231 SDValue FalseOp = N->getOperand(0); 15232 SDValue TrueOp = N->getOperand(1); 15233 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 15234 SDValue Cond = N->getOperand(3); 15235 15236 if (CC == X86::COND_E || CC == X86::COND_NE) { 15237 switch (Cond.getOpcode()) { 15238 default: break; 15239 case X86ISD::BSR: 15240 case X86ISD::BSF: 15241 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 15242 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 15243 return (CC == X86::COND_E) ? FalseOp : TrueOp; 15244 } 15245 } 15246 15247 SDValue Flags; 15248 15249 Flags = checkBoolTestSetCCCombine(Cond, CC); 15250 if (Flags.getNode() && 15251 // Extra check as FCMOV only supports a subset of X86 cond. 15252 (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { 15253 SDValue Ops[] = { FalseOp, TrueOp, 15254 DAG.getConstant(CC, MVT::i8), Flags }; 15255 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), 15256 Ops, array_lengthof(Ops)); 15257 } 15258 15259 // If this is a select between two integer constants, try to do some 15260 // optimizations. Note that the operands are ordered the opposite of SELECT 15261 // operands. 
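  // (Illustrative note: X86ISD::CMOV here is CMOV(FalseOp, TrueOp, CC, EFLAGS),
  // i.e. operand 0 is the value used when the condition is false, whereas
  // ISD::SELECT takes (Cond, TrueVal, FalseVal).)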
15262 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 15263 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 15264 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 15265 // larger than FalseC (the false value). 15266 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 15267 CC = X86::GetOppositeBranchCondition(CC); 15268 std::swap(TrueC, FalseC); 15269 std::swap(TrueOp, FalseOp); 15270 } 15271 15272 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 15273 // This is efficient for any integer data type (including i8/i16) and 15274 // shift amount. 15275 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 15276 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 15277 DAG.getConstant(CC, MVT::i8), Cond); 15278 15279 // Zero extend the condition if needed. 15280 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 15281 15282 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 15283 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 15284 DAG.getConstant(ShAmt, MVT::i8)); 15285 if (N->getNumValues() == 2) // Dead flag value? 15286 return DCI.CombineTo(N, Cond, SDValue()); 15287 return Cond; 15288 } 15289 15290 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 15291 // for any integer data type, including i8/i16. 15292 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 15293 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 15294 DAG.getConstant(CC, MVT::i8), Cond); 15295 15296 // Zero extend the condition if needed. 15297 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 15298 FalseC->getValueType(0), Cond); 15299 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 15300 SDValue(FalseC, 0)); 15301 15302 if (N->getNumValues() == 2) // Dead flag value? 15303 return DCI.CombineTo(N, Cond, SDValue()); 15304 return Cond; 15305 } 15306 15307 // Optimize cases that will turn into an LEA instruction. This requires 15308 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 15309 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 15310 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 15311 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 15312 15313 bool isFastMultiplier = false; 15314 if (Diff < 10) { 15315 switch ((unsigned char)Diff) { 15316 default: break; 15317 case 1: // result = add base, cond 15318 case 2: // result = lea base( , cond*2) 15319 case 3: // result = lea base(cond, cond*2) 15320 case 4: // result = lea base( , cond*4) 15321 case 5: // result = lea base(cond, cond*4) 15322 case 8: // result = lea base( , cond*8) 15323 case 9: // result = lea base(cond, cond*8) 15324 isFastMultiplier = true; 15325 break; 15326 } 15327 } 15328 15329 if (isFastMultiplier) { 15330 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 15331 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 15332 DAG.getConstant(CC, MVT::i8), Cond); 15333 // Zero extend the condition if needed. 15334 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 15335 Cond); 15336 // Scale the condition by the difference. 15337 if (Diff != 1) 15338 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 15339 DAG.getConstant(Diff, Cond.getValueType())); 15340 15341 // Add the base if non-zero. 15342 if (FalseC->getAPIntValue() != 0) 15343 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 15344 SDValue(FalseC, 0)); 15345 if (N->getNumValues() == 2) // Dead flag value? 
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }

  // Handle these cases:
  //   (select (x != c), e, c) -> (select (x != c), e, x),
  //   (select (x == c), c, e) -> (select (x == c), x, e)
  // where c is an integer constant, and the "select" is the combination
  // of CMOV and CMP.
  //
  // The rationale for this change is that the conditional-move from a constant
  // needs two instructions; however, a conditional-move from a register needs
  // only one instruction.
  //
  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
  // some instruction-combining opportunities. This optimization needs to be
  // postponed as late as possible.
  //
  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    // The DCI.xxxx conditions are provided to postpone the optimization as
    // late as possible.

    ConstantSDNode *CmpAgainst = 0;
    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
        dyn_cast<ConstantSDNode>(Cond.getOperand(0)) == 0) {

      if (CC == X86::COND_NE &&
          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueOp, FalseOp);
      }

      if (CC == X86::COND_E &&
          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
                          DAG.getConstant(CC, MVT::i8), Cond };
        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops,
                           array_lengthof(Ops));
      }
    }
  }

  return SDValue();
}

/// PerformMulCombine - Optimize a single multiply with a constant into two
/// multiplies in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
    DebugLoc DL = N->getDebugLoc();

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is pow2, issue it first. We want the multiply
      // by 3, 5, or 9 to be folded into the addressing mode unless the lone
      // use is an add.
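      // (For example, MulAmt == 40 is split as MulAmt1 = 5, MulAmt2 = 8; since
      // 8 is a power of two and the sole use is not an ADD, the operands are
      // swapped and we emit a shift-left-by-3 followed by a multiply-by-5,
      // which the backend can match with an LEA.)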
      std::swap(MulAmt1, MulAmt2);

    SDValue NewMul;
    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, VT));

    // Do not add new nodes to the DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);
  }
  return SDValue();
}

static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zeros or all ones.
  if (VT.isInteger() && !VT.isVector() &&
      N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
        ((N00.getOpcode() == ISD::ANY_EXTEND ||
          N00.getOpcode() == ISD::ZERO_EXTEND) &&
         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
      APInt ShAmt = N1C->getAPIntValue();
      Mask = Mask.shl(ShAmt);
      if (Mask != 0)
        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
                           N00, DAG.getConstant(Mask, VT));
    }
  }

  // Hardware support for vector shifts is sparse, which makes us scalarize the
  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
  // SHL.
  // (shl V, 1) -> add V,V
  if (isSplatVector(N1.getNode())) {
    assert(N0.getValueType().isVector() && "Invalid vector shift type");
    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
    // We shift all of the values by one. In many cases we do not have
    // hardware support for this operation. This is better expressed as an ADD
    // of two values.
    if (N1C && (1 == N1C->getZExtValue())) {
      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
    }
  }

  return SDValue();
}

/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
/// when possible.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (N->getOpcode() == ISD::SHL) {
    SDValue V = PerformSHLCombine(N, DAG);
    if (V.getNode()) return V;
  }

  // On X86 with SSE2 support, we can transform this to a vector shift if
  // all elements are shifted by the same amount. We can't do this in legalize
  // because a constant vector is typically transformed to a constant pool,
  // so we have no knowledge of the shift amount.
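  // For example, a hypothetical (shl (v4i32 X), (build_vector 5,5,5,5)) can be
  // rewritten here as a single X86ISD::VSHLI (i.e. a PSLLD by immediate) once
  // we prove that every lane is shifted by the same amount.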
15516 if (!Subtarget->hasSSE2()) 15517 return SDValue(); 15518 15519 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && 15520 (!Subtarget->hasInt256() || 15521 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) 15522 return SDValue(); 15523 15524 SDValue ShAmtOp = N->getOperand(1); 15525 EVT EltVT = VT.getVectorElementType(); 15526 DebugLoc DL = N->getDebugLoc(); 15527 SDValue BaseShAmt = SDValue(); 15528 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 15529 unsigned NumElts = VT.getVectorNumElements(); 15530 unsigned i = 0; 15531 for (; i != NumElts; ++i) { 15532 SDValue Arg = ShAmtOp.getOperand(i); 15533 if (Arg.getOpcode() == ISD::UNDEF) continue; 15534 BaseShAmt = Arg; 15535 break; 15536 } 15537 // Handle the case where the build_vector is all undef 15538 // FIXME: Should DAG allow this? 15539 if (i == NumElts) 15540 return SDValue(); 15541 15542 for (; i != NumElts; ++i) { 15543 SDValue Arg = ShAmtOp.getOperand(i); 15544 if (Arg.getOpcode() == ISD::UNDEF) continue; 15545 if (Arg != BaseShAmt) { 15546 return SDValue(); 15547 } 15548 } 15549 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 15550 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 15551 SDValue InVec = ShAmtOp.getOperand(0); 15552 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 15553 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 15554 unsigned i = 0; 15555 for (; i != NumElts; ++i) { 15556 SDValue Arg = InVec.getOperand(i); 15557 if (Arg.getOpcode() == ISD::UNDEF) continue; 15558 BaseShAmt = Arg; 15559 break; 15560 } 15561 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 15562 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 15563 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 15564 if (C->getZExtValue() == SplatIdx) 15565 BaseShAmt = InVec.getOperand(1); 15566 } 15567 } 15568 if (BaseShAmt.getNode() == 0) { 15569 // Don't create instructions with illegal types after legalize 15570 // types has run. 15571 if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) && 15572 !DCI.isBeforeLegalize()) 15573 return SDValue(); 15574 15575 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 15576 DAG.getIntPtrConstant(0)); 15577 } 15578 } else 15579 return SDValue(); 15580 15581 // The shift amount is an i32. 15582 if (EltVT.bitsGT(MVT::i32)) 15583 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 15584 else if (EltVT.bitsLT(MVT::i32)) 15585 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 15586 15587 // The shift amount is identical so we can do a vector shift. 
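  // (Note: the SRA case below has no v2i64/v4i64 entries because x86 has no
  // packed 64-bit arithmetic right shift, e.g. no PSRAQ, prior to AVX-512.)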
15588 SDValue ValOp = N->getOperand(0); 15589 switch (N->getOpcode()) { 15590 default: 15591 llvm_unreachable("Unknown shift opcode!"); 15592 case ISD::SHL: 15593 switch (VT.getSimpleVT().SimpleTy) { 15594 default: return SDValue(); 15595 case MVT::v2i64: 15596 case MVT::v4i32: 15597 case MVT::v8i16: 15598 case MVT::v4i64: 15599 case MVT::v8i32: 15600 case MVT::v16i16: 15601 return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG); 15602 } 15603 case ISD::SRA: 15604 switch (VT.getSimpleVT().SimpleTy) { 15605 default: return SDValue(); 15606 case MVT::v4i32: 15607 case MVT::v8i16: 15608 case MVT::v8i32: 15609 case MVT::v16i16: 15610 return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG); 15611 } 15612 case ISD::SRL: 15613 switch (VT.getSimpleVT().SimpleTy) { 15614 default: return SDValue(); 15615 case MVT::v2i64: 15616 case MVT::v4i32: 15617 case MVT::v8i16: 15618 case MVT::v4i64: 15619 case MVT::v8i32: 15620 case MVT::v16i16: 15621 return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG); 15622 } 15623 } 15624} 15625 15626// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 15627// where both setccs reference the same FP CMP, and rewrite for CMPEQSS 15628// and friends. Likewise for OR -> CMPNEQSS. 15629static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 15630 TargetLowering::DAGCombinerInfo &DCI, 15631 const X86Subtarget *Subtarget) { 15632 unsigned opcode; 15633 15634 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 15635 // we're requiring SSE2 for both. 15636 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 15637 SDValue N0 = N->getOperand(0); 15638 SDValue N1 = N->getOperand(1); 15639 SDValue CMP0 = N0->getOperand(1); 15640 SDValue CMP1 = N1->getOperand(1); 15641 DebugLoc DL = N->getDebugLoc(); 15642 15643 // The SETCCs should both refer to the same CMP. 15644 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 15645 return SDValue(); 15646 15647 SDValue CMP00 = CMP0->getOperand(0); 15648 SDValue CMP01 = CMP0->getOperand(1); 15649 EVT VT = CMP00.getValueType(); 15650 15651 if (VT == MVT::f32 || VT == MVT::f64) { 15652 bool ExpectingFlags = false; 15653 // Check for any users that want flags: 15654 for (SDNode::use_iterator UI = N->use_begin(), 15655 UE = N->use_end(); 15656 !ExpectingFlags && UI != UE; ++UI) 15657 switch (UI->getOpcode()) { 15658 default: 15659 case ISD::BR_CC: 15660 case ISD::BRCOND: 15661 case ISD::SELECT: 15662 ExpectingFlags = true; 15663 break; 15664 case ISD::CopyToReg: 15665 case ISD::SIGN_EXTEND: 15666 case ISD::ZERO_EXTEND: 15667 case ISD::ANY_EXTEND: 15668 break; 15669 } 15670 15671 if (!ExpectingFlags) { 15672 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 15673 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 15674 15675 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 15676 X86::CondCode tmp = cc0; 15677 cc0 = cc1; 15678 cc1 = tmp; 15679 } 15680 15681 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 15682 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 15683 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 15684 X86ISD::NodeType NTOperator = is64BitFP ? 15685 X86ISD::FSETCCsd : X86ISD::FSETCCss; 15686 // FIXME: need symbolic constants for these magic numbers. 15687 // See X86ATTInstPrinter.cpp:printSSECC(). 15688 unsigned x86cc = (cc0 == X86::COND_E) ? 
0 : 4; 15689 SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, 15690 DAG.getConstant(x86cc, MVT::i8)); 15691 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, 15692 OnesOrZeroesF); 15693 SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, 15694 DAG.getConstant(1, MVT::i32)); 15695 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); 15696 return OneBitOfTruth; 15697 } 15698 } 15699 } 15700 } 15701 return SDValue(); 15702} 15703 15704/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector 15705/// so it can be folded inside ANDNP. 15706static bool CanFoldXORWithAllOnes(const SDNode *N) { 15707 EVT VT = N->getValueType(0); 15708 15709 // Match direct AllOnes for 128 and 256-bit vectors 15710 if (ISD::isBuildVectorAllOnes(N)) 15711 return true; 15712 15713 // Look through a bit convert. 15714 if (N->getOpcode() == ISD::BITCAST) 15715 N = N->getOperand(0).getNode(); 15716 15717 // Sometimes the operand may come from a insert_subvector building a 256-bit 15718 // allones vector 15719 if (VT.is256BitVector() && 15720 N->getOpcode() == ISD::INSERT_SUBVECTOR) { 15721 SDValue V1 = N->getOperand(0); 15722 SDValue V2 = N->getOperand(1); 15723 15724 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && 15725 V1.getOperand(0).getOpcode() == ISD::UNDEF && 15726 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && 15727 ISD::isBuildVectorAllOnes(V2.getNode())) 15728 return true; 15729 } 15730 15731 return false; 15732} 15733 15734// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized 15735// register. In most cases we actually compare or select YMM-sized registers 15736// and mixing the two types creates horrible code. This method optimizes 15737// some of the transition sequences. 15738static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, 15739 TargetLowering::DAGCombinerInfo &DCI, 15740 const X86Subtarget *Subtarget) { 15741 EVT VT = N->getValueType(0); 15742 if (VT.getSizeInBits() != 256) 15743 return SDValue(); 15744 15745 assert((N->getOpcode() == ISD::ANY_EXTEND || 15746 N->getOpcode() == ISD::ZERO_EXTEND || 15747 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); 15748 15749 SDValue Narrow = N->getOperand(0); 15750 EVT NarrowVT = Narrow->getValueType(0); 15751 if (NarrowVT.getSizeInBits() != 128) 15752 return SDValue(); 15753 15754 if (Narrow->getOpcode() != ISD::XOR && 15755 Narrow->getOpcode() != ISD::AND && 15756 Narrow->getOpcode() != ISD::OR) 15757 return SDValue(); 15758 15759 SDValue N0 = Narrow->getOperand(0); 15760 SDValue N1 = Narrow->getOperand(1); 15761 DebugLoc DL = Narrow->getDebugLoc(); 15762 15763 // The Left side has to be a trunc. 15764 if (N0.getOpcode() != ISD::TRUNCATE) 15765 return SDValue(); 15766 15767 // The type of the truncated inputs. 15768 EVT WideVT = N0->getOperand(0)->getValueType(0); 15769 if (WideVT != VT) 15770 return SDValue(); 15771 15772 // The right side has to be a 'trunc' or a constant vector. 15773 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; 15774 bool RHSConst = (isSplatVector(N1.getNode()) && 15775 isa<ConstantSDNode>(N1->getOperand(0))); 15776 if (!RHSTrunc && !RHSConst) 15777 return SDValue(); 15778 15779 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15780 15781 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) 15782 return SDValue(); 15783 15784 // Set N0 and N1 to hold the inputs to the new wide operation. 
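  // (Sketch of the ZERO_EXTEND case: with 256-bit inputs A and B of type v8i32
  // and a narrow v8i16 op, (zext (and (trunc A), (trunc B))) becomes
  // (and (and A, B), (build_vector 0xFFFF, ...)), i.e. the logic op is redone
  // at the wide type and the extension is replaced by a mask.)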
15785 N0 = N0->getOperand(0); 15786 if (RHSConst) { 15787 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), 15788 N1->getOperand(0)); 15789 SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); 15790 N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size()); 15791 } else if (RHSTrunc) { 15792 N1 = N1->getOperand(0); 15793 } 15794 15795 // Generate the wide operation. 15796 SDValue Op = DAG.getNode(N->getOpcode(), DL, WideVT, N0, N1); 15797 unsigned Opcode = N->getOpcode(); 15798 switch (Opcode) { 15799 case ISD::ANY_EXTEND: 15800 return Op; 15801 case ISD::ZERO_EXTEND: { 15802 unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); 15803 APInt Mask = APInt::getAllOnesValue(InBits); 15804 Mask = Mask.zext(VT.getScalarType().getSizeInBits()); 15805 return DAG.getNode(ISD::AND, DL, VT, 15806 Op, DAG.getConstant(Mask, VT)); 15807 } 15808 case ISD::SIGN_EXTEND: 15809 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, 15810 Op, DAG.getValueType(NarrowVT)); 15811 default: 15812 llvm_unreachable("Unexpected opcode"); 15813 } 15814} 15815 15816static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 15817 TargetLowering::DAGCombinerInfo &DCI, 15818 const X86Subtarget *Subtarget) { 15819 EVT VT = N->getValueType(0); 15820 if (DCI.isBeforeLegalizeOps()) 15821 return SDValue(); 15822 15823 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 15824 if (R.getNode()) 15825 return R; 15826 15827 // Create BLSI, and BLSR instructions 15828 // BLSI is X & (-X) 15829 // BLSR is X & (X-1) 15830 if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { 15831 SDValue N0 = N->getOperand(0); 15832 SDValue N1 = N->getOperand(1); 15833 DebugLoc DL = N->getDebugLoc(); 15834 15835 // Check LHS for neg 15836 if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && 15837 isZero(N0.getOperand(0))) 15838 return DAG.getNode(X86ISD::BLSI, DL, VT, N1); 15839 15840 // Check RHS for neg 15841 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && 15842 isZero(N1.getOperand(0))) 15843 return DAG.getNode(X86ISD::BLSI, DL, VT, N0); 15844 15845 // Check LHS for X-1 15846 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 15847 isAllOnes(N0.getOperand(1))) 15848 return DAG.getNode(X86ISD::BLSR, DL, VT, N1); 15849 15850 // Check RHS for X-1 15851 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 15852 isAllOnes(N1.getOperand(1))) 15853 return DAG.getNode(X86ISD::BLSR, DL, VT, N0); 15854 15855 return SDValue(); 15856 } 15857 15858 // Want to form ANDNP nodes: 15859 // 1) In the hopes of then easily combining them with OR and AND nodes 15860 // to form PBLEND/PSIGN. 
  // 2) To match ANDN packed intrinsics
  if (VT != MVT::v2i64 && VT != MVT::v4i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  DebugLoc DL = N->getDebugLoc();

  // Check LHS for vnot
  if (N0.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);

  // Check RHS for vnot
  if (N1.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
  if (R.getNode())
    return R;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Look for PSIGN/BLEND patterns.
  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
    if (!Subtarget->hasSSSE3() ||
        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
      return SDValue();

    // Canonicalize pandn to RHS
    if (N0.getOpcode() == X86ISD::ANDNP)
      std::swap(N0, N1);
    // or (and (m, y), (pandn m, x))
    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
      SDValue Mask = N1.getOperand(0);
      SDValue X = N1.getOperand(1);
      SDValue Y;
      if (N0.getOperand(0) == Mask)
        Y = N0.getOperand(1);
      if (N0.getOperand(1) == Mask)
        Y = N0.getOperand(0);

      // Check to see if the mask appeared in both the AND and ANDNP; if it
      // didn't, bail out.
      if (!Y.getNode())
        return SDValue();

      // Validate that X, Y, and Mask are BITCASTs, and see through them.
      // Look through mask bitcast.
      if (Mask.getOpcode() == ISD::BITCAST)
        Mask = Mask.getOperand(0);
      if (X.getOpcode() == ISD::BITCAST)
        X = X.getOperand(0);
      if (Y.getOpcode() == ISD::BITCAST)
        Y = Y.getOperand(0);

      EVT MaskVT = Mask.getValueType();

      // Validate that the Mask operand is a vector sra node.
      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
      // there is no psrai.b
      if (Mask.getOpcode() != X86ISD::VSRAI)
        return SDValue();

      // Check that the SRA is all signbits.
      SDValue SraC = Mask.getOperand(1);
      unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
      if ((SraAmt + 1) != EltBits)
        return SDValue();

      DebugLoc DL = N->getDebugLoc();

      // We are going to replace the AND, OR, ANDNP with either BLEND
      // or PSIGN, which only look at the MSB. The VSRAI instruction
      // does not affect the highest bit, so we can get rid of it.
      Mask = Mask.getOperand(0);

      // Now we know we at least have a pblendvb with the mask val. See if
      // we can form a psignb/w/d.
15954 // psign = x.type == y.type == mask.type && y = sub(0, x); 15955 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 15956 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 15957 X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { 15958 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && 15959 "Unsupported VT for PSIGN"); 15960 Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask); 15961 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 15962 } 15963 // PBLENDVB only available on SSE 4.1 15964 if (!Subtarget->hasSSE41()) 15965 return SDValue(); 15966 15967 EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; 15968 15969 X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); 15970 Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); 15971 Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); 15972 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); 15973 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 15974 } 15975 } 15976 15977 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 15978 return SDValue(); 15979 15980 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 15981 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 15982 std::swap(N0, N1); 15983 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 15984 return SDValue(); 15985 if (!N0.hasOneUse() || !N1.hasOneUse()) 15986 return SDValue(); 15987 15988 SDValue ShAmt0 = N0.getOperand(1); 15989 if (ShAmt0.getValueType() != MVT::i8) 15990 return SDValue(); 15991 SDValue ShAmt1 = N1.getOperand(1); 15992 if (ShAmt1.getValueType() != MVT::i8) 15993 return SDValue(); 15994 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 15995 ShAmt0 = ShAmt0.getOperand(0); 15996 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 15997 ShAmt1 = ShAmt1.getOperand(0); 15998 15999 DebugLoc DL = N->getDebugLoc(); 16000 unsigned Opc = X86ISD::SHLD; 16001 SDValue Op0 = N0.getOperand(0); 16002 SDValue Op1 = N1.getOperand(0); 16003 if (ShAmt0.getOpcode() == ISD::SUB) { 16004 Opc = X86ISD::SHRD; 16005 std::swap(Op0, Op1); 16006 std::swap(ShAmt0, ShAmt1); 16007 } 16008 16009 unsigned Bits = VT.getSizeInBits(); 16010 if (ShAmt1.getOpcode() == ISD::SUB) { 16011 SDValue Sum = ShAmt1.getOperand(0); 16012 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 16013 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 16014 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 16015 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 16016 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 16017 return DAG.getNode(Opc, DL, VT, 16018 Op0, Op1, 16019 DAG.getNode(ISD::TRUNCATE, DL, 16020 MVT::i8, ShAmt0)); 16021 } 16022 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 16023 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 16024 if (ShAmt0C && 16025 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 16026 return DAG.getNode(Opc, DL, VT, 16027 N0.getOperand(0), N1.getOperand(0), 16028 DAG.getNode(ISD::TRUNCATE, DL, 16029 MVT::i8, ShAmt0)); 16030 } 16031 16032 return SDValue(); 16033} 16034 16035// Generate NEG and CMOV for integer abs. 16036static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { 16037 EVT VT = N->getValueType(0); 16038 16039 // Since X86 does not have CMOV for 8-bit integer, we don't convert 16040 // 8-bit integer abs to NEG and CMOV. 
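  // The pattern matched below is the classic branchless abs,
  //   Y = sra(X, bitwidth - 1);  abs(X) = xor(add(X, Y), Y)
  // which is rewritten as a SUB (0 - X) that sets EFLAGS plus a CMOV that
  // picks either X or the negated value.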
16041 if (VT.isInteger() && VT.getSizeInBits() == 8) 16042 return SDValue(); 16043 16044 SDValue N0 = N->getOperand(0); 16045 SDValue N1 = N->getOperand(1); 16046 DebugLoc DL = N->getDebugLoc(); 16047 16048 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 16049 // and change it to SUB and CMOV. 16050 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 16051 N0.getOpcode() == ISD::ADD && 16052 N0.getOperand(1) == N1 && 16053 N1.getOpcode() == ISD::SRA && 16054 N1.getOperand(0) == N0.getOperand(0)) 16055 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 16056 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { 16057 // Generate SUB & CMOV. 16058 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), 16059 DAG.getConstant(0, VT), N0.getOperand(0)); 16060 16061 SDValue Ops[] = { N0.getOperand(0), Neg, 16062 DAG.getConstant(X86::COND_GE, MVT::i8), 16063 SDValue(Neg.getNode(), 1) }; 16064 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), 16065 Ops, array_lengthof(Ops)); 16066 } 16067 return SDValue(); 16068} 16069 16070// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes 16071static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, 16072 TargetLowering::DAGCombinerInfo &DCI, 16073 const X86Subtarget *Subtarget) { 16074 EVT VT = N->getValueType(0); 16075 if (DCI.isBeforeLegalizeOps()) 16076 return SDValue(); 16077 16078 if (Subtarget->hasCMov()) { 16079 SDValue RV = performIntegerAbsCombine(N, DAG); 16080 if (RV.getNode()) 16081 return RV; 16082 } 16083 16084 // Try forming BMI if it is available. 16085 if (!Subtarget->hasBMI()) 16086 return SDValue(); 16087 16088 if (VT != MVT::i32 && VT != MVT::i64) 16089 return SDValue(); 16090 16091 assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions"); 16092 16093 // Create BLSMSK instructions by finding X ^ (X-1) 16094 SDValue N0 = N->getOperand(0); 16095 SDValue N1 = N->getOperand(1); 16096 DebugLoc DL = N->getDebugLoc(); 16097 16098 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 16099 isAllOnes(N0.getOperand(1))) 16100 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); 16101 16102 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 16103 isAllOnes(N1.getOperand(1))) 16104 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); 16105 16106 return SDValue(); 16107} 16108 16109/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. 16110static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, 16111 TargetLowering::DAGCombinerInfo &DCI, 16112 const X86Subtarget *Subtarget) { 16113 LoadSDNode *Ld = cast<LoadSDNode>(N); 16114 EVT RegVT = Ld->getValueType(0); 16115 EVT MemVT = Ld->getMemoryVT(); 16116 DebugLoc dl = Ld->getDebugLoc(); 16117 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16118 16119 ISD::LoadExtType Ext = Ld->getExtensionType(); 16120 16121 // If this is a vector EXT Load then attempt to optimize it using a 16122 // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the 16123 // expansion is still better than scalar code. 16124 // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll 16125 // emit a shuffle and a arithmetic shift. 16126 // TODO: It is possible to support ZExt by zeroing the undef values 16127 // during the shuffle phase or after the shuffle. 
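  // Illustrative example (SSE2 only, no SSE4.1): a sextload of v4i8 to v4i32
  // is done with one scalar i32 load, a shuffle that places each byte in the
  // top byte of its i32 lane, and an arithmetic shift right by 24.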
16128 if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() && 16129 (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) { 16130 assert(MemVT != RegVT && "Cannot extend to the same type"); 16131 assert(MemVT.isVector() && "Must load a vector from memory"); 16132 16133 unsigned NumElems = RegVT.getVectorNumElements(); 16134 unsigned RegSz = RegVT.getSizeInBits(); 16135 unsigned MemSz = MemVT.getSizeInBits(); 16136 assert(RegSz > MemSz && "Register size must be greater than the mem size"); 16137 16138 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) 16139 return SDValue(); 16140 16141 // All sizes must be a power of two. 16142 if (!isPowerOf2_32(RegSz * MemSz * NumElems)) 16143 return SDValue(); 16144 16145 // Attempt to load the original value using scalar loads. 16146 // Find the largest scalar type that divides the total loaded size. 16147 MVT SclrLoadTy = MVT::i8; 16148 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 16149 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 16150 MVT Tp = (MVT::SimpleValueType)tp; 16151 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { 16152 SclrLoadTy = Tp; 16153 } 16154 } 16155 16156 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 16157 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && 16158 (64 <= MemSz)) 16159 SclrLoadTy = MVT::f64; 16160 16161 // Calculate the number of scalar loads that we need to perform 16162 // in order to load our vector from memory. 16163 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); 16164 if (Ext == ISD::SEXTLOAD && NumLoads > 1) 16165 return SDValue(); 16166 16167 unsigned loadRegZize = RegSz; 16168 if (Ext == ISD::SEXTLOAD && RegSz == 256) 16169 loadRegZize /= 2; 16170 16171 // Represent our vector as a sequence of elements which are the 16172 // largest scalar that we can load. 16173 EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, 16174 loadRegZize/SclrLoadTy.getSizeInBits()); 16175 16176 // Represent the data using the same element type that is stored in 16177 // memory. In practice, we ''widen'' MemVT. 16178 EVT WideVecVT = 16179 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 16180 loadRegZize/MemVT.getScalarType().getSizeInBits()); 16181 16182 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && 16183 "Invalid vector type"); 16184 16185 // We can't shuffle using an illegal type. 16186 if (!TLI.isTypeLegal(WideVecVT)) 16187 return SDValue(); 16188 16189 SmallVector<SDValue, 8> Chains; 16190 SDValue Ptr = Ld->getBasePtr(); 16191 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8, 16192 TLI.getPointerTy()); 16193 SDValue Res = DAG.getUNDEF(LoadUnitVecVT); 16194 16195 for (unsigned i = 0; i < NumLoads; ++i) { 16196 // Perform a single load. 16197 SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), 16198 Ptr, Ld->getPointerInfo(), 16199 Ld->isVolatile(), Ld->isNonTemporal(), 16200 Ld->isInvariant(), Ld->getAlignment()); 16201 Chains.push_back(ScalarLoad.getValue(1)); 16202 // Create the first element type using SCALAR_TO_VECTOR in order to avoid 16203 // another round of DAGCombining. 
16204 if (i == 0) 16205 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); 16206 else 16207 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, 16208 ScalarLoad, DAG.getIntPtrConstant(i)); 16209 16210 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 16211 } 16212 16213 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 16214 Chains.size()); 16215 16216 // Bitcast the loaded value to a vector of the original element type, in 16217 // the size of the target vector type. 16218 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); 16219 unsigned SizeRatio = RegSz/MemSz; 16220 16221 if (Ext == ISD::SEXTLOAD) { 16222 // If we have SSE4.1 we can directly emit a VSEXT node. 16223 if (Subtarget->hasSSE41()) { 16224 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); 16225 return DCI.CombineTo(N, Sext, TF, true); 16226 } 16227 16228 // Otherwise we'll shuffle the small elements in the high bits of the 16229 // larger type and perform an arithmetic shift. If the shift is not legal 16230 // it's better to scalarize. 16231 if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT)) 16232 return SDValue(); 16233 16234 // Redistribute the loaded elements into the different locations. 16235 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 16236 for (unsigned i = 0; i != NumElems; ++i) 16237 ShuffleVec[i*SizeRatio + SizeRatio-1] = i; 16238 16239 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 16240 DAG.getUNDEF(WideVecVT), 16241 &ShuffleVec[0]); 16242 16243 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 16244 16245 // Build the arithmetic shift. 16246 unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - 16247 MemVT.getVectorElementType().getSizeInBits(); 16248 SmallVector<SDValue, 8> C(NumElems, 16249 DAG.getConstant(Amt, RegVT.getScalarType())); 16250 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size()); 16251 Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV); 16252 16253 return DCI.CombineTo(N, Shuff, TF, true); 16254 } 16255 16256 // Redistribute the loaded elements into the different locations. 16257 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 16258 for (unsigned i = 0; i != NumElems; ++i) 16259 ShuffleVec[i*SizeRatio] = i; 16260 16261 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 16262 DAG.getUNDEF(WideVecVT), 16263 &ShuffleVec[0]); 16264 16265 // Bitcast to the requested type. 16266 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 16267 // Replace the original load with the new sequence 16268 // and return the new chain. 16269 return DCI.CombineTo(N, Shuff, TF, true); 16270 } 16271 16272 return SDValue(); 16273} 16274 16275/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 16276static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 16277 const X86Subtarget *Subtarget) { 16278 StoreSDNode *St = cast<StoreSDNode>(N); 16279 EVT VT = St->getValue().getValueType(); 16280 EVT StVT = St->getMemoryVT(); 16281 DebugLoc dl = St->getDebugLoc(); 16282 SDValue StoredVal = St->getOperand(1); 16283 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16284 16285 // If we are saving a concatenation of two XMM registers, perform two stores. 16286 // On Sandy Bridge, 256-bit memory operations are executed by two 16287 // 128-bit ports. However, on Haswell it is better to issue a single 256-bit 16288 // memory operation. 
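  // E.g. on a target without AVX2 (!hasInt256()), a store of
  // (v8f32 (concat_vectors X, Y)) is split into a 128-bit store of X at the
  // base pointer and a 128-bit store of Y at base+16, tied together with a
  // TokenFactor.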
16289 if (VT.is256BitVector() && !Subtarget->hasInt256() && 16290 StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && 16291 StoredVal.getNumOperands() == 2) { 16292 SDValue Value0 = StoredVal.getOperand(0); 16293 SDValue Value1 = StoredVal.getOperand(1); 16294 16295 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); 16296 SDValue Ptr0 = St->getBasePtr(); 16297 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); 16298 16299 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 16300 St->getPointerInfo(), St->isVolatile(), 16301 St->isNonTemporal(), St->getAlignment()); 16302 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 16303 St->getPointerInfo(), St->isVolatile(), 16304 St->isNonTemporal(), St->getAlignment()); 16305 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 16306 } 16307 16308 // Optimize trunc store (of multiple scalars) to shuffle and store. 16309 // First, pack all of the elements in one place. Next, store to memory 16310 // in fewer chunks. 16311 if (St->isTruncatingStore() && VT.isVector()) { 16312 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16313 unsigned NumElems = VT.getVectorNumElements(); 16314 assert(StVT != VT && "Cannot truncate to the same type"); 16315 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 16316 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 16317 16318 // From, To sizes and ElemCount must be pow of two 16319 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 16320 // We are going to use the original vector elt for storing. 16321 // Accumulated smaller vector elements must be a multiple of the store size. 16322 if (0 != (NumElems * FromSz) % ToSz) return SDValue(); 16323 16324 unsigned SizeRatio = FromSz / ToSz; 16325 16326 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 16327 16328 // Create a type on which we perform the shuffle 16329 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 16330 StVT.getScalarType(), NumElems*SizeRatio); 16331 16332 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 16333 16334 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); 16335 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 16336 for (unsigned i = 0; i != NumElems; ++i) 16337 ShuffleVec[i] = i * SizeRatio; 16338 16339 // Can't shuffle using an illegal type. 16340 if (!TLI.isTypeLegal(WideVecVT)) 16341 return SDValue(); 16342 16343 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 16344 DAG.getUNDEF(WideVecVT), 16345 &ShuffleVec[0]); 16346 // At this point all of the data is stored at the bottom of the 16347 // register. We now need to save it to mem. 16348 16349 // Find the largest store unit 16350 MVT StoreType = MVT::i8; 16351 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 16352 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 16353 MVT Tp = (MVT::SimpleValueType)tp; 16354 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) 16355 StoreType = Tp; 16356 } 16357 16358 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 
16359 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && 16360 (64 <= NumElems * ToSz)) 16361 StoreType = MVT::f64; 16362 16363 // Bitcast the original vector into a vector of store-size units 16364 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 16365 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); 16366 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 16367 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); 16368 SmallVector<SDValue, 8> Chains; 16369 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, 16370 TLI.getPointerTy()); 16371 SDValue Ptr = St->getBasePtr(); 16372 16373 // Perform one or more big stores into memory. 16374 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { 16375 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 16376 StoreType, ShuffWide, 16377 DAG.getIntPtrConstant(i)); 16378 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 16379 St->getPointerInfo(), St->isVolatile(), 16380 St->isNonTemporal(), St->getAlignment()); 16381 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 16382 Chains.push_back(Ch); 16383 } 16384 16385 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 16386 Chains.size()); 16387 } 16388 16389 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 16390 // the FP state in cases where an emms may be missing. 16391 // A preferable solution to the general problem is to figure out the right 16392 // places to insert EMMS. This qualifies as a quick hack. 16393 16394 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 16395 if (VT.getSizeInBits() != 64) 16396 return SDValue(); 16397 16398 const Function *F = DAG.getMachineFunction().getFunction(); 16399 bool NoImplicitFloatOps = F->getFnAttributes(). 16400 hasAttribute(Attribute::NoImplicitFloat); 16401 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps 16402 && Subtarget->hasSSE2(); 16403 if ((VT.isVector() || 16404 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 16405 isa<LoadSDNode>(St->getValue()) && 16406 !cast<LoadSDNode>(St->getValue())->isVolatile() && 16407 St->getChain().hasOneUse() && !St->isVolatile()) { 16408 SDNode* LdVal = St->getValue().getNode(); 16409 LoadSDNode *Ld = 0; 16410 int TokenFactorIndex = -1; 16411 SmallVector<SDValue, 8> Ops; 16412 SDNode* ChainVal = St->getChain().getNode(); 16413 // Must be a store of a load. We currently handle two cases: the load 16414 // is a direct child, and it's under an intervening TokenFactor. It is 16415 // possible to dig deeper under nested TokenFactors. 16416 if (ChainVal == LdVal) 16417 Ld = cast<LoadSDNode>(St->getChain()); 16418 else if (St->getValue().hasOneUse() && 16419 ChainVal->getOpcode() == ISD::TokenFactor) { 16420 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { 16421 if (ChainVal->getOperand(i).getNode() == LdVal) { 16422 TokenFactorIndex = i; 16423 Ld = cast<LoadSDNode>(St->getValue()); 16424 } else 16425 Ops.push_back(ChainVal->getOperand(i)); 16426 } 16427 } 16428 16429 if (!Ld || !ISD::isNormalLoad(Ld)) 16430 return SDValue(); 16431 16432 // If this is not the MMX case, i.e. we are just turning i64 load/store 16433 // into f64 load/store, avoid the transformation if there are multiple 16434 // uses of the loaded value. 
16435 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 16436 return SDValue(); 16437 16438 DebugLoc LdDL = Ld->getDebugLoc(); 16439 DebugLoc StDL = N->getDebugLoc(); 16440 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 16441 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 16442 // pair instead. 16443 if (Subtarget->is64Bit() || F64IsLegal) { 16444 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 16445 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 16446 Ld->getPointerInfo(), Ld->isVolatile(), 16447 Ld->isNonTemporal(), Ld->isInvariant(), 16448 Ld->getAlignment()); 16449 SDValue NewChain = NewLd.getValue(1); 16450 if (TokenFactorIndex != -1) { 16451 Ops.push_back(NewChain); 16452 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 16453 Ops.size()); 16454 } 16455 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 16456 St->getPointerInfo(), 16457 St->isVolatile(), St->isNonTemporal(), 16458 St->getAlignment()); 16459 } 16460 16461 // Otherwise, lower to two pairs of 32-bit loads / stores. 16462 SDValue LoAddr = Ld->getBasePtr(); 16463 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 16464 DAG.getConstant(4, MVT::i32)); 16465 16466 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 16467 Ld->getPointerInfo(), 16468 Ld->isVolatile(), Ld->isNonTemporal(), 16469 Ld->isInvariant(), Ld->getAlignment()); 16470 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 16471 Ld->getPointerInfo().getWithOffset(4), 16472 Ld->isVolatile(), Ld->isNonTemporal(), 16473 Ld->isInvariant(), 16474 MinAlign(Ld->getAlignment(), 4)); 16475 16476 SDValue NewChain = LoLd.getValue(1); 16477 if (TokenFactorIndex != -1) { 16478 Ops.push_back(LoLd); 16479 Ops.push_back(HiLd); 16480 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 16481 Ops.size()); 16482 } 16483 16484 LoAddr = St->getBasePtr(); 16485 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 16486 DAG.getConstant(4, MVT::i32)); 16487 16488 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 16489 St->getPointerInfo(), 16490 St->isVolatile(), St->isNonTemporal(), 16491 St->getAlignment()); 16492 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 16493 St->getPointerInfo().getWithOffset(4), 16494 St->isVolatile(), 16495 St->isNonTemporal(), 16496 MinAlign(St->getAlignment(), 4)); 16497 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 16498 } 16499 return SDValue(); 16500} 16501 16502/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" 16503/// and return the operands for the horizontal operation in LHS and RHS. A 16504/// horizontal operation performs the binary operation on successive elements 16505/// of its first operand, then on successive elements of its second operand, 16506/// returning the resulting values in a vector. For example, if 16507/// A = < float a0, float a1, float a2, float a3 > 16508/// and 16509/// B = < float b0, float b1, float b2, float b3 > 16510/// then the result of doing a horizontal operation on A and B is 16511/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 16512/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form 16513/// A horizontal-op B, for some already available A and B, and if so then LHS is 16514/// set to A, RHS to B, and the routine returns 'true'. 
16515/// Note that the binary operation should have the property that if one of the 16516/// operands is UNDEF then the result is UNDEF. 16517static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { 16518 // Look for the following pattern: if 16519 // A = < float a0, float a1, float a2, float a3 > 16520 // B = < float b0, float b1, float b2, float b3 > 16521 // and 16522 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> 16523 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> 16524 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > 16525 // which is A horizontal-op B. 16526 16527 // At least one of the operands should be a vector shuffle. 16528 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && 16529 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) 16530 return false; 16531 16532 EVT VT = LHS.getValueType(); 16533 16534 assert((VT.is128BitVector() || VT.is256BitVector()) && 16535 "Unsupported vector type for horizontal add/sub"); 16536 16537 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to 16538 // operate independently on 128-bit lanes. 16539 unsigned NumElts = VT.getVectorNumElements(); 16540 unsigned NumLanes = VT.getSizeInBits()/128; 16541 unsigned NumLaneElts = NumElts / NumLanes; 16542 assert((NumLaneElts % 2 == 0) && 16543 "Vector type should have an even number of elements in each lane"); 16544 unsigned HalfLaneElts = NumLaneElts/2; 16545 16546 // View LHS in the form 16547 // LHS = VECTOR_SHUFFLE A, B, LMask 16548 // If LHS is not a shuffle then pretend it is the shuffle 16549 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> 16550 // NOTE: in what follows a default initialized SDValue represents an UNDEF of 16551 // type VT. 16552 SDValue A, B; 16553 SmallVector<int, 16> LMask(NumElts); 16554 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 16555 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) 16556 A = LHS.getOperand(0); 16557 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) 16558 B = LHS.getOperand(1); 16559 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); 16560 std::copy(Mask.begin(), Mask.end(), LMask.begin()); 16561 } else { 16562 if (LHS.getOpcode() != ISD::UNDEF) 16563 A = LHS; 16564 for (unsigned i = 0; i != NumElts; ++i) 16565 LMask[i] = i; 16566 } 16567 16568 // Likewise, view RHS in the form 16569 // RHS = VECTOR_SHUFFLE C, D, RMask 16570 SDValue C, D; 16571 SmallVector<int, 16> RMask(NumElts); 16572 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 16573 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) 16574 C = RHS.getOperand(0); 16575 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) 16576 D = RHS.getOperand(1); 16577 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); 16578 std::copy(Mask.begin(), Mask.end(), RMask.begin()); 16579 } else { 16580 if (RHS.getOpcode() != ISD::UNDEF) 16581 C = RHS; 16582 for (unsigned i = 0; i != NumElts; ++i) 16583 RMask[i] = i; 16584 } 16585 16586 // Check that the shuffles are both shuffling the same vectors. 16587 if (!(A == C && B == D) && !(A == D && B == C)) 16588 return false; 16589 16590 // If everything is UNDEF then bail out: it would be better to fold to UNDEF. 16591 if (!A.getNode() && !B.getNode()) 16592 return false; 16593 16594 // If A and B occur in reverse order in RHS, then "swap" them (which means 16595 // rewriting the mask). 
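  // (CommuteVectorShuffleMask flips each non-undef mask entry to refer to the
  // other input: an index m < NumElts becomes m + NumElts and vice versa.)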
16596 if (A != C) 16597 CommuteVectorShuffleMask(RMask, NumElts); 16598 16599 // At this point LHS and RHS are equivalent to 16600 // LHS = VECTOR_SHUFFLE A, B, LMask 16601 // RHS = VECTOR_SHUFFLE A, B, RMask 16602 // Check that the masks correspond to performing a horizontal operation. 16603 for (unsigned i = 0; i != NumElts; ++i) { 16604 int LIdx = LMask[i], RIdx = RMask[i]; 16605 16606 // Ignore any UNDEF components. 16607 if (LIdx < 0 || RIdx < 0 || 16608 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || 16609 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) 16610 continue; 16611 16612 // Check that successive elements are being operated on. If not, this is 16613 // not a horizontal operation. 16614 unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs 16615 unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; 16616 int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; 16617 if (!(LIdx == Index && RIdx == Index + 1) && 16618 !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) 16619 return false; 16620 } 16621 16622 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 16623 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 16624 return true; 16625} 16626 16627/// PerformFADDCombine - Do target-specific dag combines on floating point adds. 16628static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, 16629 const X86Subtarget *Subtarget) { 16630 EVT VT = N->getValueType(0); 16631 SDValue LHS = N->getOperand(0); 16632 SDValue RHS = N->getOperand(1); 16633 16634 // Try to synthesize horizontal adds from adds of shuffles. 16635 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 16636 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 16637 isHorizontalBinOp(LHS, RHS, true)) 16638 return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); 16639 return SDValue(); 16640} 16641 16642/// PerformFSUBCombine - Do target-specific dag combines on floating point subs. 16643static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, 16644 const X86Subtarget *Subtarget) { 16645 EVT VT = N->getValueType(0); 16646 SDValue LHS = N->getOperand(0); 16647 SDValue RHS = N->getOperand(1); 16648 16649 // Try to synthesize horizontal subs from subs of shuffles. 16650 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 16651 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 16652 isHorizontalBinOp(LHS, RHS, false)) 16653 return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); 16654 return SDValue(); 16655} 16656 16657/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 16658/// X86ISD::FXOR nodes. 16659static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 16660 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 16661 // F[X]OR(0.0, x) -> x 16662 // F[X]OR(x, 0.0) -> x 16663 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 16664 if (C->getValueAPF().isPosZero()) 16665 return N->getOperand(1); 16666 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 16667 if (C->getValueAPF().isPosZero()) 16668 return N->getOperand(0); 16669 return SDValue(); 16670} 16671 16672/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and 16673/// X86ISD::FMAX nodes. 
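/// (These nodes follow the MINPS/MAXPS semantics: when an operand is NaN, or
/// both operands are zeroes of differing sign, the result is taken from the
/// second operand, so FMIN/FMAX are not commutative. The commutative FMINC and
/// FMAXC forms are therefore only created under unsafe FP math; see the
/// UnsafeFPMath check below.)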
16674static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { 16675 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); 16676 16677 // Only perform optimizations if UnsafeMath is used. 16678 if (!DAG.getTarget().Options.UnsafeFPMath) 16679 return SDValue(); 16680 16681 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes 16682 // into FMINC and FMAXC, which are Commutative operations. 16683 unsigned NewOp = 0; 16684 switch (N->getOpcode()) { 16685 default: llvm_unreachable("unknown opcode"); 16686 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; 16687 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; 16688 } 16689 16690 return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0), 16691 N->getOperand(0), N->getOperand(1)); 16692} 16693 16694/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 16695static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 16696 // FAND(0.0, x) -> 0.0 16697 // FAND(x, 0.0) -> 0.0 16698 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 16699 if (C->getValueAPF().isPosZero()) 16700 return N->getOperand(0); 16701 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 16702 if (C->getValueAPF().isPosZero()) 16703 return N->getOperand(1); 16704 return SDValue(); 16705} 16706 16707static SDValue PerformBTCombine(SDNode *N, 16708 SelectionDAG &DAG, 16709 TargetLowering::DAGCombinerInfo &DCI) { 16710 // BT ignores high bits in the bit index operand. 16711 SDValue Op1 = N->getOperand(1); 16712 if (Op1.hasOneUse()) { 16713 unsigned BitWidth = Op1.getValueSizeInBits(); 16714 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 16715 APInt KnownZero, KnownOne; 16716 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 16717 !DCI.isBeforeLegalizeOps()); 16718 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16719 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 16720 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 16721 DCI.CommitTargetLoweringOpt(TLO); 16722 } 16723 return SDValue(); 16724} 16725 16726static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 16727 SDValue Op = N->getOperand(0); 16728 if (Op.getOpcode() == ISD::BITCAST) 16729 Op = Op.getOperand(0); 16730 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 16731 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 16732 VT.getVectorElementType().getSizeInBits() == 16733 OpVT.getVectorElementType().getSizeInBits()) { 16734 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 16735 } 16736 return SDValue(); 16737} 16738 16739static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, 16740 TargetLowering::DAGCombinerInfo &DCI, 16741 const X86Subtarget *Subtarget) { 16742 if (!DCI.isBeforeLegalizeOps()) 16743 return SDValue(); 16744 16745 if (!Subtarget->hasFp256()) 16746 return SDValue(); 16747 16748 EVT VT = N->getValueType(0); 16749 SDValue Op = N->getOperand(0); 16750 EVT OpVT = Op.getValueType(); 16751 DebugLoc dl = N->getDebugLoc(); 16752 16753 if (VT.isVector() && VT.getSizeInBits() == 256) { 16754 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); 16755 if (R.getNode()) 16756 return R; 16757 } 16758 16759 if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) || 16760 (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { 16761 16762 if (Subtarget->hasInt256()) 16763 return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op); 16764 16765 // Optimize vectors in AVX mode 16766 // Sign extend v8i16 to v8i32 and 
16767 // v4i32 to v4i64 16768 // 16769 // Divide input vector into two parts 16770 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} 16771 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 16772 // concat the vectors to original VT 16773 16774 unsigned NumElems = OpVT.getVectorNumElements(); 16775 SDValue Undef = DAG.getUNDEF(OpVT); 16776 16777 SmallVector<int,8> ShufMask1(NumElems, -1); 16778 for (unsigned i = 0; i != NumElems/2; ++i) 16779 ShufMask1[i] = i; 16780 16781 SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]); 16782 16783 SmallVector<int,8> ShufMask2(NumElems, -1); 16784 for (unsigned i = 0; i != NumElems/2; ++i) 16785 ShufMask2[i] = i + NumElems/2; 16786 16787 SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]); 16788 16789 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), 16790 VT.getVectorNumElements()/2); 16791 16792 OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); 16793 OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); 16794 16795 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 16796 } 16797 return SDValue(); 16798} 16799 16800static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, 16801 const X86Subtarget* Subtarget) { 16802 DebugLoc dl = N->getDebugLoc(); 16803 EVT VT = N->getValueType(0); 16804 16805 // Let legalize expand this if it isn't a legal type yet. 16806 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 16807 return SDValue(); 16808 16809 EVT ScalarVT = VT.getScalarType(); 16810 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || 16811 (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) 16812 return SDValue(); 16813 16814 SDValue A = N->getOperand(0); 16815 SDValue B = N->getOperand(1); 16816 SDValue C = N->getOperand(2); 16817 16818 bool NegA = (A.getOpcode() == ISD::FNEG); 16819 bool NegB = (B.getOpcode() == ISD::FNEG); 16820 bool NegC = (C.getOpcode() == ISD::FNEG); 16821 16822 // Negative multiplication when NegA xor NegB 16823 bool NegMul = (NegA != NegB); 16824 if (NegA) 16825 A = A.getOperand(0); 16826 if (NegB) 16827 B = B.getOperand(0); 16828 if (NegC) 16829 C = C.getOperand(0); 16830 16831 unsigned Opcode; 16832 if (!NegMul) 16833 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; 16834 else 16835 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; 16836 16837 return DAG.getNode(Opcode, dl, VT, A, B, C); 16838} 16839 16840static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, 16841 TargetLowering::DAGCombinerInfo &DCI, 16842 const X86Subtarget *Subtarget) { 16843 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 16844 // (and (i32 x86isd::setcc_carry), 1) 16845 // This eliminates the zext. This transformation is necessary because 16846 // ISD::SETCC is always legalized to i8. 
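 // For example:
 //   (i32 (zext (i8 (and (X86ISD::SETCC_CARRY cc, eflags), 1))))
 // becomes
 //   (i32 (and (X86ISD::SETCC_CARRY cc, eflags), 1))
 // with the SETCC_CARRY simply re-emitted at the wider type.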
16847 DebugLoc dl = N->getDebugLoc(); 16848 SDValue N0 = N->getOperand(0); 16849 EVT VT = N->getValueType(0); 16850 EVT OpVT = N0.getValueType(); 16851 16852 if (N0.getOpcode() == ISD::AND && 16853 N0.hasOneUse() && 16854 N0.getOperand(0).hasOneUse()) { 16855 SDValue N00 = N0.getOperand(0); 16856 if (N00.getOpcode() == X86ISD::SETCC_CARRY) { 16857 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 16858 if (!C || C->getZExtValue() != 1) 16859 return SDValue(); 16860 return DAG.getNode(ISD::AND, dl, VT, 16861 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 16862 N00.getOperand(0), N00.getOperand(1)), 16863 DAG.getConstant(1, VT)); 16864 } 16865 } 16866 16867 if (VT.isVector() && VT.getSizeInBits() == 256) { 16868 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); 16869 if (R.getNode()) 16870 return R; 16871 } 16872 16873 // Optimize vectors in AVX mode: 16874 // 16875 // v8i16 -> v8i32 16876 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. 16877 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. 16878 // Concat upper and lower parts. 16879 // 16880 // v4i32 -> v4i64 16881 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. 16882 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. 16883 // Concat upper and lower parts. 16884 // 16885 if (!DCI.isBeforeLegalizeOps()) 16886 return SDValue(); 16887 16888 if (!Subtarget->hasFp256()) 16889 return SDValue(); 16890 16891 if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || 16892 ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) { 16893 16894 if (Subtarget->hasInt256()) 16895 return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0); 16896 16897 SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); 16898 SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec); 16899 SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec); 16900 16901 EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 16902 VT.getVectorNumElements()/2); 16903 16904 OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); 16905 OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); 16906 16907 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 16908 } 16909 16910 return SDValue(); 16911} 16912 16913// Optimize x == -y --> x+y == 0 16914// x != -y --> x+y != 0 16915static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { 16916 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 16917 SDValue LHS = N->getOperand(0); 16918 SDValue RHS = N->getOperand(1); 16919 16920 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) 16921 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) 16922 if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { 16923 SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), 16924 LHS.getValueType(), RHS, LHS.getOperand(1)); 16925 return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), 16926 addV, DAG.getConstant(0, addV.getValueType()), CC); 16927 } 16928 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) 16929 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) 16930 if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { 16931 SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), 16932 RHS.getValueType(), LHS, RHS.getOperand(1)); 16933 return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), 16934 addV, DAG.getConstant(0, addV.getValueType()), CC); 16935 } 16936 return SDValue(); 16937} 16938 16939// Helper function of PerformSETCCCombine. 
It is to materialize "setb reg" 16940// as "sbb reg,reg", since it can be extended without zext and produces 16941// an all-ones bit which is more useful than 0/1 in some cases. 16942static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { 16943 return DAG.getNode(ISD::AND, DL, MVT::i8, 16944 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 16945 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS), 16946 DAG.getConstant(1, MVT::i8)); 16947} 16948 16949// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 16950static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, 16951 TargetLowering::DAGCombinerInfo &DCI, 16952 const X86Subtarget *Subtarget) { 16953 DebugLoc DL = N->getDebugLoc(); 16954 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); 16955 SDValue EFLAGS = N->getOperand(1); 16956 16957 if (CC == X86::COND_A) { 16958 // Try to convert COND_A into COND_B in an attempt to facilitate 16959 // materializing "setb reg". 16960 // 16961 // Do not flip "e > c", where "c" is a constant, because Cmp instruction 16962 // cannot take an immediate as its first operand. 16963 // 16964 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && 16965 EFLAGS.getValueType().isInteger() && 16966 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { 16967 SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(), 16968 EFLAGS.getNode()->getVTList(), 16969 EFLAGS.getOperand(1), EFLAGS.getOperand(0)); 16970 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); 16971 return MaterializeSETB(DL, NewEFLAGS, DAG); 16972 } 16973 } 16974 16975 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 16976 // a zext and produces an all-ones bit which is more useful than 0/1 in some 16977 // cases. 16978 if (CC == X86::COND_B) 16979 return MaterializeSETB(DL, EFLAGS, DAG); 16980 16981 SDValue Flags; 16982 16983 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 16984 if (Flags.getNode()) { 16985 SDValue Cond = DAG.getConstant(CC, MVT::i8); 16986 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); 16987 } 16988 16989 return SDValue(); 16990} 16991 16992// Optimize branch condition evaluation. 16993// 16994static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, 16995 TargetLowering::DAGCombinerInfo &DCI, 16996 const X86Subtarget *Subtarget) { 16997 DebugLoc DL = N->getDebugLoc(); 16998 SDValue Chain = N->getOperand(0); 16999 SDValue Dest = N->getOperand(1); 17000 SDValue EFLAGS = N->getOperand(3); 17001 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); 17002 17003 SDValue Flags; 17004 17005 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 17006 if (Flags.getNode()) { 17007 SDValue Cond = DAG.getConstant(CC, MVT::i8); 17008 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, 17009 Flags); 17010 } 17011 17012 return SDValue(); 17013} 17014 17015static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 17016 const X86TargetLowering *XTLI) { 17017 SDValue Op0 = N->getOperand(0); 17018 EVT InVT = Op0->getValueType(0); 17019 17020 // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) 17021 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { 17022 DebugLoc dl = N->getDebugLoc(); 17023 MVT DstVT = InVT == MVT::v4i8 ? 
MVT::v4i32 : MVT::v8i32; 17024 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); 17025 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); 17026 } 17027 17028 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 17029 // a 32-bit target where SSE doesn't support i64->FP operations. 17030 if (Op0.getOpcode() == ISD::LOAD) { 17031 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 17032 EVT VT = Ld->getValueType(0); 17033 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 17034 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 17035 !XTLI->getSubtarget()->is64Bit() && 17036 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 17037 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 17038 Ld->getChain(), Op0, DAG); 17039 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 17040 return FILDChain; 17041 } 17042 } 17043 return SDValue(); 17044} 17045 17046// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 17047static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 17048 X86TargetLowering::DAGCombinerInfo &DCI) { 17049 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 17050 // the result is either zero or one (depending on the input carry bit). 17051 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 17052 if (X86::isZeroNode(N->getOperand(0)) && 17053 X86::isZeroNode(N->getOperand(1)) && 17054 // We don't have a good way to replace an EFLAGS use, so only do this when 17055 // dead right now. 17056 SDValue(N, 1).use_empty()) { 17057 DebugLoc DL = N->getDebugLoc(); 17058 EVT VT = N->getValueType(0); 17059 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 17060 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 17061 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 17062 DAG.getConstant(X86::COND_B,MVT::i8), 17063 N->getOperand(2)), 17064 DAG.getConstant(1, VT)); 17065 return DCI.CombineTo(N, Res1, CarryOut); 17066 } 17067 17068 return SDValue(); 17069} 17070 17071// fold (add Y, (sete X, 0)) -> adc 0, Y 17072// (add Y, (setne X, 0)) -> sbb -1, Y 17073// (sub (sete X, 0), Y) -> sbb 0, Y 17074// (sub (setne X, 0), Y) -> adc -1, Y 17075static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 17076 DebugLoc DL = N->getDebugLoc(); 17077 17078 // Look through ZExts. 17079 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 17080 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 17081 return SDValue(); 17082 17083 SDValue SetCC = Ext.getOperand(0); 17084 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 17085 return SDValue(); 17086 17087 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 17088 if (CC != X86::COND_E && CC != X86::COND_NE) 17089 return SDValue(); 17090 17091 SDValue Cmp = SetCC.getOperand(1); 17092 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 17093 !X86::isZeroNode(Cmp.getOperand(1)) || 17094 !Cmp.getOperand(0).getValueType().isInteger()) 17095 return SDValue(); 17096 17097 SDValue CmpOp0 = Cmp.getOperand(0); 17098 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 17099 DAG.getConstant(1, CmpOp0.getValueType())); 17100 17101 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 17102 if (CC == X86::COND_NE) 17103 return DAG.getNode(N->getOpcode() == ISD::SUB ? 
X86ISD::ADC : X86ISD::SBB, 17104 DL, OtherVal.getValueType(), OtherVal, 17105 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 17106 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, 17107 DL, OtherVal.getValueType(), OtherVal, 17108 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 17109} 17110 17111/// PerformAddCombine - Do target-specific dag combines on integer adds. 17112static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, 17113 const X86Subtarget *Subtarget) { 17114 EVT VT = N->getValueType(0); 17115 SDValue Op0 = N->getOperand(0); 17116 SDValue Op1 = N->getOperand(1); 17117 17118 // Try to synthesize horizontal adds from adds of shuffles. 17119 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || 17120 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && 17121 isHorizontalBinOp(Op0, Op1, true)) 17122 return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1); 17123 17124 return OptimizeConditionalInDecrement(N, DAG); 17125} 17126 17127static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, 17128 const X86Subtarget *Subtarget) { 17129 SDValue Op0 = N->getOperand(0); 17130 SDValue Op1 = N->getOperand(1); 17131 17132 // X86 can't encode an immediate LHS of a sub. See if we can push the 17133 // negation into a preceding instruction. 17134 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { 17135 // If the RHS of the sub is a XOR with one use and a constant, invert the 17136 // immediate. Then add one to the LHS of the sub so we can turn 17137 // X-Y -> X+~Y+1, saving one register. 17138 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && 17139 isa<ConstantSDNode>(Op1.getOperand(1))) { 17140 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); 17141 EVT VT = Op0.getValueType(); 17142 SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT, 17143 Op1.getOperand(0), 17144 DAG.getConstant(~XorC, VT)); 17145 return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor, 17146 DAG.getConstant(C->getAPIntValue()+1, VT)); 17147 } 17148 } 17149 17150 // Try to synthesize horizontal subs from subs of shuffles.
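 // For example, with v4i32 and
 //   Op0 = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
 //   Op1 = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
 // Op0 - Op1 is < a0-a1, a2-a3, b0-b1, b2-b3 >, which is HSUB A, B
 // (phsubd with SSSE3).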
17151 EVT VT = N->getValueType(0); 17152 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || 17153 (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && 17154 isHorizontalBinOp(Op0, Op1, true)) 17155 return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); 17156 17157 return OptimizeConditionalInDecrement(N, DAG); 17158} 17159 17160/// performVZEXTCombine - Performs build vector combines 17161static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, 17162 TargetLowering::DAGCombinerInfo &DCI, 17163 const X86Subtarget *Subtarget) { 17164 // (vzext (bitcast (vzext (x)) -> (vzext x) 17165 SDValue In = N->getOperand(0); 17166 while (In.getOpcode() == ISD::BITCAST) 17167 In = In.getOperand(0); 17168 17169 if (In.getOpcode() != X86ISD::VZEXT) 17170 return SDValue(); 17171 17172 return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), In.getOperand(0)); 17173} 17174 17175SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 17176 DAGCombinerInfo &DCI) const { 17177 SelectionDAG &DAG = DCI.DAG; 17178 switch (N->getOpcode()) { 17179 default: break; 17180 case ISD::EXTRACT_VECTOR_ELT: 17181 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); 17182 case ISD::VSELECT: 17183 case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); 17184 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); 17185 case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); 17186 case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); 17187 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); 17188 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 17189 case ISD::SHL: 17190 case ISD::SRA: 17191 case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget); 17192 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); 17193 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 17194 case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); 17195 case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); 17196 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 17197 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); 17198 case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); 17199 case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); 17200 case X86ISD::FXOR: 17201 case X86ISD::FOR: return PerformFORCombine(N, DAG); 17202 case X86ISD::FMIN: 17203 case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); 17204 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 17205 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 17206 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 17207 case ISD::ANY_EXTEND: 17208 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); 17209 case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); 17210 case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); 17211 case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); 17212 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); 17213 case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); 17214 case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); 17215 case X86ISD::SHUFP: // Handle all target specific shuffles 17216 case X86ISD::PALIGN: 17217 case X86ISD::UNPCKH: 17218 case X86ISD::UNPCKL: 17219 case X86ISD::MOVHLPS: 17220 case X86ISD::MOVLHPS: 17221 case X86ISD::PSHUFD: 17222 case X86ISD::PSHUFHW: 17223 case X86ISD::PSHUFLW: 
17224 case X86ISD::MOVSS: 17225 case X86ISD::MOVSD: 17226 case X86ISD::VPERMILP: 17227 case X86ISD::VPERM2X128: 17228 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); 17229 case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); 17230 } 17231 17232 return SDValue(); 17233} 17234 17235/// isTypeDesirableForOp - Return true if the target has native support for 17236/// the specified value type and it is 'desirable' to use the type for the 17237/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 17238/// instruction encodings are longer and some i16 instructions are slow. 17239bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 17240 if (!isTypeLegal(VT)) 17241 return false; 17242 if (VT != MVT::i16) 17243 return true; 17244 17245 switch (Opc) { 17246 default: 17247 return true; 17248 case ISD::LOAD: 17249 case ISD::SIGN_EXTEND: 17250 case ISD::ZERO_EXTEND: 17251 case ISD::ANY_EXTEND: 17252 case ISD::SHL: 17253 case ISD::SRL: 17254 case ISD::SUB: 17255 case ISD::ADD: 17256 case ISD::MUL: 17257 case ISD::AND: 17258 case ISD::OR: 17259 case ISD::XOR: 17260 return false; 17261 } 17262} 17263 17264/// IsDesirableToPromoteOp - This method query the target whether it is 17265/// beneficial for dag combiner to promote the specified node. If true, it 17266/// should return the desired promotion type by reference. 17267bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 17268 EVT VT = Op.getValueType(); 17269 if (VT != MVT::i16) 17270 return false; 17271 17272 bool Promote = false; 17273 bool Commute = false; 17274 switch (Op.getOpcode()) { 17275 default: break; 17276 case ISD::LOAD: { 17277 LoadSDNode *LD = cast<LoadSDNode>(Op); 17278 // If the non-extending load has a single use and it's not live out, then it 17279 // might be folded. 17280 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 17281 Op.hasOneUse()*/) { 17282 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 17283 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 17284 // The only case where we'd want to promote LOAD (rather then it being 17285 // promoted as an operand is when it's only use is liveout. 17286 if (UI->getOpcode() != ISD::CopyToReg) 17287 return false; 17288 } 17289 } 17290 Promote = true; 17291 break; 17292 } 17293 case ISD::SIGN_EXTEND: 17294 case ISD::ZERO_EXTEND: 17295 case ISD::ANY_EXTEND: 17296 Promote = true; 17297 break; 17298 case ISD::SHL: 17299 case ISD::SRL: { 17300 SDValue N0 = Op.getOperand(0); 17301 // Look out for (store (shl (load), x)). 17302 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 17303 return false; 17304 Promote = true; 17305 break; 17306 } 17307 case ISD::ADD: 17308 case ISD::MUL: 17309 case ISD::AND: 17310 case ISD::OR: 17311 case ISD::XOR: 17312 Commute = true; 17313 // fallthrough 17314 case ISD::SUB: { 17315 SDValue N0 = Op.getOperand(0); 17316 SDValue N1 = Op.getOperand(1); 17317 if (!Commute && MayFoldLoad(N1)) 17318 return false; 17319 // Avoid disabling potential load folding opportunities. 
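 // For example, promoting an i16 (add (load p), y) to i32 would prevent
 // the 16-bit load from being folded into the add as a memory operand.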
17320 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 17321 return false; 17322 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 17323 return false; 17324 Promote = true; 17325 } 17326 } 17327 17328 PVT = MVT::i32; 17329 return Promote; 17330} 17331 17332//===----------------------------------------------------------------------===// 17333// X86 Inline Assembly Support 17334//===----------------------------------------------------------------------===// 17335 17336namespace { 17337 // Helper to match a string separated by whitespace. 17338 bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { 17339 s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. 17340 17341 for (unsigned i = 0, e = args.size(); i != e; ++i) { 17342 StringRef piece(*args[i]); 17343 if (!s.startswith(piece)) // Check if the piece matches. 17344 return false; 17345 17346 s = s.substr(piece.size()); 17347 StringRef::size_type pos = s.find_first_not_of(" \t"); 17348 if (pos == 0) // We matched a prefix. 17349 return false; 17350 17351 s = s.substr(pos); 17352 } 17353 17354 return s.empty(); 17355 } 17356 const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; 17357} 17358 17359bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 17360 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 17361 17362 std::string AsmStr = IA->getAsmString(); 17363 17364 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 17365 if (!Ty || Ty->getBitWidth() % 16 != 0) 17366 return false; 17367 17368 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 17369 SmallVector<StringRef, 4> AsmPieces; 17370 SplitString(AsmStr, AsmPieces, ";\n"); 17371 17372 switch (AsmPieces.size()) { 17373 default: return false; 17374 case 1: 17375 // FIXME: this should verify that we are targeting a 486 or better. If not, 17376 // we will turn this bswap into something that will be lowered to logical 17377 // ops instead of emitting the bswap asm. For now, we don't support 486 or 17378 // lower so don't worry about this. 17379 // bswap $0 17380 if (matchAsm(AsmPieces[0], "bswap", "$0") || 17381 matchAsm(AsmPieces[0], "bswapl", "$0") || 17382 matchAsm(AsmPieces[0], "bswapq", "$0") || 17383 matchAsm(AsmPieces[0], "bswap", "${0:q}") || 17384 matchAsm(AsmPieces[0], "bswapl", "${0:q}") || 17385 matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { 17386 // No need to check constraints, nothing other than the equivalent of 17387 // "=r,0" would be valid here. 
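 // For example, inline asm such as
 //   asm("bswap $0" : "=r"(x) : "0"(x))
 // on an i32 value is simply replaced with a call to llvm.bswap.i32.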
17388 return IntrinsicLowering::LowerToByteSwap(CI); 17389 } 17390 17391 // rorw $$8, ${0:w} --> llvm.bswap.i16 17392 if (CI->getType()->isIntegerTy(16) && 17393 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 17394 (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || 17395 matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { 17396 AsmPieces.clear(); 17397 const std::string &ConstraintsStr = IA->getConstraintString(); 17398 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 17399 std::sort(AsmPieces.begin(), AsmPieces.end()); 17400 if (AsmPieces.size() == 4 && 17401 AsmPieces[0] == "~{cc}" && 17402 AsmPieces[1] == "~{dirflag}" && 17403 AsmPieces[2] == "~{flags}" && 17404 AsmPieces[3] == "~{fpsr}") 17405 return IntrinsicLowering::LowerToByteSwap(CI); 17406 } 17407 break; 17408 case 3: 17409 if (CI->getType()->isIntegerTy(32) && 17410 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 17411 matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && 17412 matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && 17413 matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { 17414 AsmPieces.clear(); 17415 const std::string &ConstraintsStr = IA->getConstraintString(); 17416 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 17417 std::sort(AsmPieces.begin(), AsmPieces.end()); 17418 if (AsmPieces.size() == 4 && 17419 AsmPieces[0] == "~{cc}" && 17420 AsmPieces[1] == "~{dirflag}" && 17421 AsmPieces[2] == "~{flags}" && 17422 AsmPieces[3] == "~{fpsr}") 17423 return IntrinsicLowering::LowerToByteSwap(CI); 17424 } 17425 17426 if (CI->getType()->isIntegerTy(64)) { 17427 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 17428 if (Constraints.size() >= 2 && 17429 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 17430 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 17431 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 17432 if (matchAsm(AsmPieces[0], "bswap", "%eax") && 17433 matchAsm(AsmPieces[1], "bswap", "%edx") && 17434 matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) 17435 return IntrinsicLowering::LowerToByteSwap(CI); 17436 } 17437 } 17438 break; 17439 } 17440 return false; 17441} 17442 17443/// getConstraintType - Given a constraint letter, return the type of 17444/// constraint it is for this target. 17445X86TargetLowering::ConstraintType 17446X86TargetLowering::getConstraintType(const std::string &Constraint) const { 17447 if (Constraint.size() == 1) { 17448 switch (Constraint[0]) { 17449 case 'R': 17450 case 'q': 17451 case 'Q': 17452 case 'f': 17453 case 't': 17454 case 'u': 17455 case 'y': 17456 case 'x': 17457 case 'Y': 17458 case 'l': 17459 return C_RegisterClass; 17460 case 'a': 17461 case 'b': 17462 case 'c': 17463 case 'd': 17464 case 'S': 17465 case 'D': 17466 case 'A': 17467 return C_Register; 17468 case 'I': 17469 case 'J': 17470 case 'K': 17471 case 'L': 17472 case 'M': 17473 case 'N': 17474 case 'G': 17475 case 'C': 17476 case 'e': 17477 case 'Z': 17478 return C_Other; 17479 default: 17480 break; 17481 } 17482 } 17483 return TargetLowering::getConstraintType(Constraint); 17484} 17485 17486/// Examine constraint type and operand type and determine a weight value. 17487/// This object must already have been set up with the operand type 17488/// and the current alternative constraint selected. 
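/// For example, an 'I' constraint with a constant operand in [0, 31] yields
/// CW_Constant, while a 'y' constraint with an MMX-typed operand yields
/// CW_SpecificReg when MMX is available.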
17489TargetLowering::ConstraintWeight 17490 X86TargetLowering::getSingleConstraintMatchWeight( 17491 AsmOperandInfo &info, const char *constraint) const { 17492 ConstraintWeight weight = CW_Invalid; 17493 Value *CallOperandVal = info.CallOperandVal; 17494 // If we don't have a value, we can't do a match, 17495 // but allow it at the lowest weight. 17496 if (CallOperandVal == NULL) 17497 return CW_Default; 17498 Type *type = CallOperandVal->getType(); 17499 // Look at the constraint type. 17500 switch (*constraint) { 17501 default: 17502 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 17503 case 'R': 17504 case 'q': 17505 case 'Q': 17506 case 'a': 17507 case 'b': 17508 case 'c': 17509 case 'd': 17510 case 'S': 17511 case 'D': 17512 case 'A': 17513 if (CallOperandVal->getType()->isIntegerTy()) 17514 weight = CW_SpecificReg; 17515 break; 17516 case 'f': 17517 case 't': 17518 case 'u': 17519 if (type->isFloatingPointTy()) 17520 weight = CW_SpecificReg; 17521 break; 17522 case 'y': 17523 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 17524 weight = CW_SpecificReg; 17525 break; 17526 case 'x': 17527 case 'Y': 17528 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || 17529 ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256())) 17530 weight = CW_Register; 17531 break; 17532 case 'I': 17533 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 17534 if (C->getZExtValue() <= 31) 17535 weight = CW_Constant; 17536 } 17537 break; 17538 case 'J': 17539 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 17540 if (C->getZExtValue() <= 63) 17541 weight = CW_Constant; 17542 } 17543 break; 17544 case 'K': 17545 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 17546 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 17547 weight = CW_Constant; 17548 } 17549 break; 17550 case 'L': 17551 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 17552 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 17553 weight = CW_Constant; 17554 } 17555 break; 17556 case 'M': 17557 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 17558 if (C->getZExtValue() <= 3) 17559 weight = CW_Constant; 17560 } 17561 break; 17562 case 'N': 17563 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 17564 if (C->getZExtValue() <= 0xff) 17565 weight = CW_Constant; 17566 } 17567 break; 17568 case 'G': 17569 case 'C': 17570 if (dyn_cast<ConstantFP>(CallOperandVal)) { 17571 weight = CW_Constant; 17572 } 17573 break; 17574 case 'e': 17575 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 17576 if ((C->getSExtValue() >= -0x80000000LL) && 17577 (C->getSExtValue() <= 0x7fffffffLL)) 17578 weight = CW_Constant; 17579 } 17580 break; 17581 case 'Z': 17582 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 17583 if (C->getZExtValue() <= 0xffffffff) 17584 weight = CW_Constant; 17585 } 17586 break; 17587 } 17588 return weight; 17589} 17590 17591/// LowerXConstraint - try to replace an X constraint, which matches anything, 17592/// with another that has more specific requirements based on the type of the 17593/// corresponding operand. 17594const char *X86TargetLowering:: 17595LowerXConstraint(EVT ConstraintVT) const { 17596 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 17597 // 'f' like normal targets. 
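 // For example, an "X" constraint on a floating point operand is rewritten
 // to "Y" when SSE2 is available, to "x" when only SSE1 is available, and
 // otherwise falls through to the generic handling below.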
17598 if (ConstraintVT.isFloatingPoint()) { 17599 if (Subtarget->hasSSE2()) 17600 return "Y"; 17601 if (Subtarget->hasSSE1()) 17602 return "x"; 17603 } 17604 17605 return TargetLowering::LowerXConstraint(ConstraintVT); 17606} 17607 17608/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 17609/// vector. If it is invalid, don't add anything to Ops. 17610void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 17611 std::string &Constraint, 17612 std::vector<SDValue>&Ops, 17613 SelectionDAG &DAG) const { 17614 SDValue Result(0, 0); 17615 17616 // Only support length 1 constraints for now. 17617 if (Constraint.length() > 1) return; 17618 17619 char ConstraintLetter = Constraint[0]; 17620 switch (ConstraintLetter) { 17621 default: break; 17622 case 'I': 17623 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 17624 if (C->getZExtValue() <= 31) { 17625 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 17626 break; 17627 } 17628 } 17629 return; 17630 case 'J': 17631 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 17632 if (C->getZExtValue() <= 63) { 17633 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 17634 break; 17635 } 17636 } 17637 return; 17638 case 'K': 17639 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 17640 if (isInt<8>(C->getSExtValue())) { 17641 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 17642 break; 17643 } 17644 } 17645 return; 17646 case 'N': 17647 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 17648 if (C->getZExtValue() <= 255) { 17649 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 17650 break; 17651 } 17652 } 17653 return; 17654 case 'e': { 17655 // 32-bit signed value 17656 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 17657 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 17658 C->getSExtValue())) { 17659 // Widen to 64 bits here to get it sign extended. 17660 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 17661 break; 17662 } 17663 // FIXME gcc accepts some relocatable values here too, but only in certain 17664 // memory models; it's complicated. 17665 } 17666 return; 17667 } 17668 case 'Z': { 17669 // 32-bit unsigned value 17670 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 17671 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 17672 C->getZExtValue())) { 17673 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 17674 break; 17675 } 17676 } 17677 // FIXME gcc accepts some relocatable values here too, but only in certain 17678 // memory models; it's complicated. 17679 return; 17680 } 17681 case 'i': { 17682 // Literal immediates are always ok. 17683 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 17684 // Widen to 64 bits here to get it sign extended. 17685 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 17686 break; 17687 } 17688 17689 // In any sort of PIC mode addresses need to be computed at runtime by 17690 // adding in a register or some sort of table lookup. These can't 17691 // be used as immediates. 17692 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 17693 return; 17694 17695 // If we are in non-pic codegen mode, we allow the address of a global (with 17696 // an optional displacement) to be used with 'i'. 17697 GlobalAddressSDNode *GA = 0; 17698 int64_t Offset = 0; 17699 17700 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
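 // For example, (add (add (GlobalAddress @g, 0), 8), 4) is accepted as a
 // reference to @g with a total Offset of 12.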
17701 while (1) { 17702 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 17703 Offset += GA->getOffset(); 17704 break; 17705 } else if (Op.getOpcode() == ISD::ADD) { 17706 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 17707 Offset += C->getZExtValue(); 17708 Op = Op.getOperand(0); 17709 continue; 17710 } 17711 } else if (Op.getOpcode() == ISD::SUB) { 17712 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 17713 Offset += -C->getZExtValue(); 17714 Op = Op.getOperand(0); 17715 continue; 17716 } 17717 } 17718 17719 // Otherwise, this isn't something we can handle, reject it. 17720 return; 17721 } 17722 17723 const GlobalValue *GV = GA->getGlobal(); 17724 // If we require an extra load to get this address, as in PIC mode, we 17725 // can't accept it. 17726 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 17727 getTargetMachine()))) 17728 return; 17729 17730 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 17731 GA->getValueType(0), Offset); 17732 break; 17733 } 17734 } 17735 17736 if (Result.getNode()) { 17737 Ops.push_back(Result); 17738 return; 17739 } 17740 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 17741} 17742 17743std::pair<unsigned, const TargetRegisterClass*> 17744X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 17745 EVT VT) const { 17746 // First, see if this is a constraint that directly corresponds to an LLVM 17747 // register class. 17748 if (Constraint.size() == 1) { 17749 // GCC Constraint Letters 17750 switch (Constraint[0]) { 17751 default: break; 17752 // TODO: Slight differences here in allocation order and leaving 17753 // RIP in the class. Do they matter any more here than they do 17754 // in the normal allocation? 17755 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
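 // For example, "q" with an i8 operand can use any GR8 register in
 // 64-bit mode, but in 32-bit mode it falls through to the 'Q' handling
 // below and is restricted to GR8_ABCD_L (al/bl/cl/dl).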
17756 if (Subtarget->is64Bit()) { 17757 if (VT == MVT::i32 || VT == MVT::f32) 17758 return std::make_pair(0U, &X86::GR32RegClass); 17759 if (VT == MVT::i16) 17760 return std::make_pair(0U, &X86::GR16RegClass); 17761 if (VT == MVT::i8 || VT == MVT::i1) 17762 return std::make_pair(0U, &X86::GR8RegClass); 17763 if (VT == MVT::i64 || VT == MVT::f64) 17764 return std::make_pair(0U, &X86::GR64RegClass); 17765 break; 17766 } 17767 // 32-bit fallthrough 17768 case 'Q': // Q_REGS 17769 if (VT == MVT::i32 || VT == MVT::f32) 17770 return std::make_pair(0U, &X86::GR32_ABCDRegClass); 17771 if (VT == MVT::i16) 17772 return std::make_pair(0U, &X86::GR16_ABCDRegClass); 17773 if (VT == MVT::i8 || VT == MVT::i1) 17774 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); 17775 if (VT == MVT::i64) 17776 return std::make_pair(0U, &X86::GR64_ABCDRegClass); 17777 break; 17778 case 'r': // GENERAL_REGS 17779 case 'l': // INDEX_REGS 17780 if (VT == MVT::i8 || VT == MVT::i1) 17781 return std::make_pair(0U, &X86::GR8RegClass); 17782 if (VT == MVT::i16) 17783 return std::make_pair(0U, &X86::GR16RegClass); 17784 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) 17785 return std::make_pair(0U, &X86::GR32RegClass); 17786 return std::make_pair(0U, &X86::GR64RegClass); 17787 case 'R': // LEGACY_REGS 17788 if (VT == MVT::i8 || VT == MVT::i1) 17789 return std::make_pair(0U, &X86::GR8_NOREXRegClass); 17790 if (VT == MVT::i16) 17791 return std::make_pair(0U, &X86::GR16_NOREXRegClass); 17792 if (VT == MVT::i32 || !Subtarget->is64Bit()) 17793 return std::make_pair(0U, &X86::GR32_NOREXRegClass); 17794 return std::make_pair(0U, &X86::GR64_NOREXRegClass); 17795 case 'f': // FP Stack registers. 17796 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 17797 // value to the correct fpstack register class. 17798 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 17799 return std::make_pair(0U, &X86::RFP32RegClass); 17800 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 17801 return std::make_pair(0U, &X86::RFP64RegClass); 17802 return std::make_pair(0U, &X86::RFP80RegClass); 17803 case 'y': // MMX_REGS if MMX allowed. 17804 if (!Subtarget->hasMMX()) break; 17805 return std::make_pair(0U, &X86::VR64RegClass); 17806 case 'Y': // SSE_REGS if SSE2 allowed 17807 if (!Subtarget->hasSSE2()) break; 17808 // FALL THROUGH. 17809 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed 17810 if (!Subtarget->hasSSE1()) break; 17811 17812 switch (VT.getSimpleVT().SimpleTy) { 17813 default: break; 17814 // Scalar SSE types. 17815 case MVT::f32: 17816 case MVT::i32: 17817 return std::make_pair(0U, &X86::FR32RegClass); 17818 case MVT::f64: 17819 case MVT::i64: 17820 return std::make_pair(0U, &X86::FR64RegClass); 17821 // Vector types. 17822 case MVT::v16i8: 17823 case MVT::v8i16: 17824 case MVT::v4i32: 17825 case MVT::v2i64: 17826 case MVT::v4f32: 17827 case MVT::v2f64: 17828 return std::make_pair(0U, &X86::VR128RegClass); 17829 // AVX types. 17830 case MVT::v32i8: 17831 case MVT::v16i16: 17832 case MVT::v8i32: 17833 case MVT::v4i64: 17834 case MVT::v8f32: 17835 case MVT::v4f64: 17836 return std::make_pair(0U, &X86::VR256RegClass); 17837 } 17838 break; 17839 } 17840 } 17841 17842 // Use the default implementation in TargetLowering to convert the register 17843 // constraint into a member of a register class. 17844 std::pair<unsigned, const TargetRegisterClass*> Res; 17845 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 17846 17847 // Not found as a standard register? 
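 // For example, the fp-stack names "{st}" and "{st(3)}", the "{flags}"
 // register and the GCC 'A' (EAX:EDX) constraint are resolved here rather
 // than by the generic register-name lookup.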
17848 if (Res.second == 0) { 17849 // Map st(0) -> st(7) -> ST0 17850 if (Constraint.size() == 7 && Constraint[0] == '{' && 17851 tolower(Constraint[1]) == 's' && 17852 tolower(Constraint[2]) == 't' && 17853 Constraint[3] == '(' && 17854 (Constraint[4] >= '0' && Constraint[4] <= '7') && 17855 Constraint[5] == ')' && 17856 Constraint[6] == '}') { 17857 17858 Res.first = X86::ST0+Constraint[4]-'0'; 17859 Res.second = &X86::RFP80RegClass; 17860 return Res; 17861 } 17862 17863 // GCC allows "st(0)" to be called just plain "st". 17864 if (StringRef("{st}").equals_lower(Constraint)) { 17865 Res.first = X86::ST0; 17866 Res.second = &X86::RFP80RegClass; 17867 return Res; 17868 } 17869 17870 // flags -> EFLAGS 17871 if (StringRef("{flags}").equals_lower(Constraint)) { 17872 Res.first = X86::EFLAGS; 17873 Res.second = &X86::CCRRegClass; 17874 return Res; 17875 } 17876 17877 // 'A' means EAX + EDX. 17878 if (Constraint == "A") { 17879 Res.first = X86::EAX; 17880 Res.second = &X86::GR32_ADRegClass; 17881 return Res; 17882 } 17883 return Res; 17884 } 17885 17886 // Otherwise, check to see if this is a register class of the wrong value 17887 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 17888 // turn into {ax},{dx}. 17889 if (Res.second->hasType(VT)) 17890 return Res; // Correct type already, nothing to do. 17891 17892 // All of the single-register GCC register classes map their values onto 17893 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 17894 // really want an 8-bit or 32-bit register, map to the appropriate register 17895 // class and return the appropriate register. 17896 if (Res.second == &X86::GR16RegClass) { 17897 if (VT == MVT::i8) { 17898 unsigned DestReg = 0; 17899 switch (Res.first) { 17900 default: break; 17901 case X86::AX: DestReg = X86::AL; break; 17902 case X86::DX: DestReg = X86::DL; break; 17903 case X86::CX: DestReg = X86::CL; break; 17904 case X86::BX: DestReg = X86::BL; break; 17905 } 17906 if (DestReg) { 17907 Res.first = DestReg; 17908 Res.second = &X86::GR8RegClass; 17909 } 17910 } else if (VT == MVT::i32) { 17911 unsigned DestReg = 0; 17912 switch (Res.first) { 17913 default: break; 17914 case X86::AX: DestReg = X86::EAX; break; 17915 case X86::DX: DestReg = X86::EDX; break; 17916 case X86::CX: DestReg = X86::ECX; break; 17917 case X86::BX: DestReg = X86::EBX; break; 17918 case X86::SI: DestReg = X86::ESI; break; 17919 case X86::DI: DestReg = X86::EDI; break; 17920 case X86::BP: DestReg = X86::EBP; break; 17921 case X86::SP: DestReg = X86::ESP; break; 17922 } 17923 if (DestReg) { 17924 Res.first = DestReg; 17925 Res.second = &X86::GR32RegClass; 17926 } 17927 } else if (VT == MVT::i64) { 17928 unsigned DestReg = 0; 17929 switch (Res.first) { 17930 default: break; 17931 case X86::AX: DestReg = X86::RAX; break; 17932 case X86::DX: DestReg = X86::RDX; break; 17933 case X86::CX: DestReg = X86::RCX; break; 17934 case X86::BX: DestReg = X86::RBX; break; 17935 case X86::SI: DestReg = X86::RSI; break; 17936 case X86::DI: DestReg = X86::RDI; break; 17937 case X86::BP: DestReg = X86::RBP; break; 17938 case X86::SP: DestReg = X86::RSP; break; 17939 } 17940 if (DestReg) { 17941 Res.first = DestReg; 17942 Res.second = &X86::GR64RegClass; 17943 } 17944 } 17945 } else if (Res.second == &X86::FR32RegClass || 17946 Res.second == &X86::FR64RegClass || 17947 Res.second == &X86::VR128RegClass) { 17948 // Handle references to XMM physical registers that got mapped into the 17949 // wrong class. 
This can happen with constraints like {xmm0} where the 17950 // target independent register mapper will just pick the first match it can 17951 // find, ignoring the required type. 17952 17953 if (VT == MVT::f32 || VT == MVT::i32) 17954 Res.second = &X86::FR32RegClass; 17955 else if (VT == MVT::f64 || VT == MVT::i64) 17956 Res.second = &X86::FR64RegClass; 17957 else if (X86::VR128RegClass.hasType(VT)) 17958 Res.second = &X86::VR128RegClass; 17959 else if (X86::VR256RegClass.hasType(VT)) 17960 Res.second = &X86::VR256RegClass; 17961 } 17962 17963 return Res; 17964} 17965 17966//===----------------------------------------------------------------------===// 17967// 17968// X86 cost model. 17969// 17970//===----------------------------------------------------------------------===// 17971 17972struct X86CostTblEntry { 17973 int ISD; 17974 MVT Type; 17975 unsigned Cost; 17976}; 17977 17978static int 17979FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD, MVT Ty) { 17980 for (unsigned int i = 0; i < len; ++i) 17981 if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty) 17982 return i; 17983 17984 // Could not find an entry. 17985 return -1; 17986} 17987 17988struct X86TypeConversionCostTblEntry { 17989 int ISD; 17990 MVT Dst; 17991 MVT Src; 17992 unsigned Cost; 17993}; 17994 17995static int 17996FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len, 17997 int ISD, MVT Dst, MVT Src) { 17998 for (unsigned int i = 0; i < len; ++i) 17999 if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst) 18000 return i; 18001 18002 // Could not find an entry. 18003 return -1; 18004} 18005 18006ScalarTargetTransformInfo::PopcntHwSupport 18007X86ScalarTargetTransformImpl::getPopcntHwSupport(unsigned TyWidth) const { 18008 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 18009 const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>(); 18010 18011 // TODO: Currently the __builtin_popcount() implementation using SSE3 18012 // instructions is inefficient. Once the problem is fixed, we should 18013 // call ST.hasSSE3() instead of ST.hasSSE4(). 18014 return ST.hasSSE41() ? Fast : None; 18015} 18016 18017unsigned 18018X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode, 18019 Type *Ty) const { 18020 // Legalize the type. 18021 std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Ty); 18022 18023 int ISD = InstructionOpcodeToISD(Opcode); 18024 assert(ISD && "Invalid opcode"); 18025 18026 const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>(); 18027 18028 static const X86CostTblEntry AVX1CostTable[] = { 18029 // We don't have to scalarize unsupported ops. We can issue two half-sized 18030 // operations and we only need to extract the upper YMM half. 18031 // Two ops + 1 extract + 1 insert = 4. 18032 { ISD::MUL, MVT::v8i32, 4 }, 18033 { ISD::SUB, MVT::v8i32, 4 }, 18034 { ISD::ADD, MVT::v8i32, 4 }, 18035 { ISD::MUL, MVT::v4i64, 4 }, 18036 { ISD::SUB, MVT::v4i64, 4 }, 18037 { ISD::ADD, MVT::v4i64, 4 }, 18038 }; 18039 18040 // Look for AVX1 lowering tricks. 18041 if (ST.hasAVX()) { 18042 int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD, 18043 LT.second); 18044 if (Idx != -1) 18045 return LT.first * AVX1CostTable[Idx].Cost; 18046 } 18047 // Fallback to the default implementation. 
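 // Ops without a table entry (or targets without AVX) are costed by the
 // generic implementation below; for example a v8i32 add on AVX is instead
 // costed from the table above as LT.first * 4.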
18048 return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty); 18049} 18050 18051unsigned 18052X86VectorTargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src, 18053 unsigned Alignment, 18054 unsigned AddressSpace) const { 18055 // Legalize the type. 18056 std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Src); 18057 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && 18058 "Invalid Opcode"); 18059 18060 const X86Subtarget &ST = 18061 TLI->getTargetMachine().getSubtarget<X86Subtarget>(); 18062 18063 // Each load/store unit costs 1. 18064 unsigned Cost = LT.first * 1; 18065 18066 // On Sandybridge 256bit load/stores are double pumped 18067 // (but not on Haswell). 18068 if (LT.second.getSizeInBits() > 128 && !ST.hasAVX2()) 18069 Cost*=2; 18070 18071 return Cost; 18072} 18073 18074unsigned 18075X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val, 18076 unsigned Index) const { 18077 assert(Val->isVectorTy() && "This must be a vector type"); 18078 18079 if (Index != -1U) { 18080 // Legalize the type. 18081 std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Val); 18082 18083 // This type is legalized to a scalar type. 18084 if (!LT.second.isVector()) 18085 return 0; 18086 18087 // The type may be split. Normalize the index to the new type. 18088 unsigned Width = LT.second.getVectorNumElements(); 18089 Index = Index % Width; 18090 18091 // Floating point scalars are already located in index #0. 18092 if (Val->getScalarType()->isFloatingPointTy() && Index == 0) 18093 return 0; 18094 } 18095 18096 return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index); 18097} 18098 18099unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, 18100 Type *ValTy, 18101 Type *CondTy) const { 18102 // Legalize the type. 18103 std::pair<unsigned, MVT> LT = getTypeLegalizationCost(ValTy); 18104 18105 MVT MTy = LT.second; 18106 18107 int ISD = InstructionOpcodeToISD(Opcode); 18108 assert(ISD && "Invalid opcode"); 18109 18110 const X86Subtarget &ST = 18111 TLI->getTargetMachine().getSubtarget<X86Subtarget>(); 18112 18113 static const X86CostTblEntry SSE42CostTbl[] = { 18114 { ISD::SETCC, MVT::v2f64, 1 }, 18115 { ISD::SETCC, MVT::v4f32, 1 }, 18116 { ISD::SETCC, MVT::v2i64, 1 }, 18117 { ISD::SETCC, MVT::v4i32, 1 }, 18118 { ISD::SETCC, MVT::v8i16, 1 }, 18119 { ISD::SETCC, MVT::v16i8, 1 }, 18120 }; 18121 18122 static const X86CostTblEntry AVX1CostTbl[] = { 18123 { ISD::SETCC, MVT::v4f64, 1 }, 18124 { ISD::SETCC, MVT::v8f32, 1 }, 18125 // AVX1 does not support 8-wide integer compare. 
18126 { ISD::SETCC, MVT::v4i64, 4 }, 18127 { ISD::SETCC, MVT::v8i32, 4 }, 18128 { ISD::SETCC, MVT::v16i16, 4 }, 18129 { ISD::SETCC, MVT::v32i8, 4 }, 18130 }; 18131 18132 static const X86CostTblEntry AVX2CostTbl[] = { 18133 { ISD::SETCC, MVT::v4i64, 1 }, 18134 { ISD::SETCC, MVT::v8i32, 1 }, 18135 { ISD::SETCC, MVT::v16i16, 1 }, 18136 { ISD::SETCC, MVT::v32i8, 1 }, 18137 }; 18138 18139 if (ST.hasAVX2()) { 18140 int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy); 18141 if (Idx != -1) 18142 return LT.first * AVX2CostTbl[Idx].Cost; 18143 } 18144 18145 if (ST.hasAVX()) { 18146 int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy); 18147 if (Idx != -1) 18148 return LT.first * AVX1CostTbl[Idx].Cost; 18149 } 18150 18151 if (ST.hasSSE42()) { 18152 int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy); 18153 if (Idx != -1) 18154 return LT.first * SSE42CostTbl[Idx].Cost; 18155 } 18156 18157 return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy); 18158} 18159 18160unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode, 18161 Type *Dst, 18162 Type *Src) const { 18163 int ISD = InstructionOpcodeToISD(Opcode); 18164 assert(ISD && "Invalid opcode"); 18165 18166 EVT SrcTy = TLI->getValueType(Src); 18167 EVT DstTy = TLI->getValueType(Dst); 18168 18169 if (!SrcTy.isSimple() || !DstTy.isSimple()) 18170 return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src); 18171 18172 const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>(); 18173 18174 static const X86TypeConversionCostTblEntry AVXConversionTbl[] = { 18175 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 18176 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, 18177 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 18178 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, 18179 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, 18180 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 }, 18181 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, 18182 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, 18183 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, 18184 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, 18185 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 }, 18186 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, 18187 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 }, 18188 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 }, 18189 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 }, 18190 }; 18191 18192 if (ST.hasAVX()) { 18193 int Idx = FindInConvertTable(AVXConversionTbl, 18194 array_lengthof(AVXConversionTbl), 18195 ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()); 18196 if (Idx != -1) 18197 return AVXConversionTbl[Idx].Cost; 18198 } 18199 18200 return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src); 18201} 18202 18203