X86ISelLowering.cpp revision fd49821c3598e254735e7d08469fb7e9905498c6
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that X86 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "x86-isel" 16#include "X86ISelLowering.h" 17#include "X86.h" 18#include "X86InstrBuilder.h" 19#include "X86TargetMachine.h" 20#include "X86TargetObjectFile.h" 21#include "Utils/X86ShuffleDecode.h" 22#include "llvm/CallingConv.h" 23#include "llvm/Constants.h" 24#include "llvm/DerivedTypes.h" 25#include "llvm/GlobalAlias.h" 26#include "llvm/GlobalVariable.h" 27#include "llvm/Function.h" 28#include "llvm/Instructions.h" 29#include "llvm/Intrinsics.h" 30#include "llvm/LLVMContext.h" 31#include "llvm/CodeGen/IntrinsicLowering.h" 32#include "llvm/CodeGen/MachineFrameInfo.h" 33#include "llvm/CodeGen/MachineFunction.h" 34#include "llvm/CodeGen/MachineInstrBuilder.h" 35#include "llvm/CodeGen/MachineJumpTableInfo.h" 36#include "llvm/CodeGen/MachineModuleInfo.h" 37#include "llvm/CodeGen/MachineRegisterInfo.h" 38#include "llvm/MC/MCAsmInfo.h" 39#include "llvm/MC/MCContext.h" 40#include "llvm/MC/MCExpr.h" 41#include "llvm/MC/MCSymbol.h" 42#include "llvm/ADT/SmallSet.h" 43#include "llvm/ADT/Statistic.h" 44#include "llvm/ADT/StringExtras.h" 45#include "llvm/ADT/VariadicFunction.h" 46#include "llvm/Support/CallSite.h" 47#include "llvm/Support/Debug.h" 48#include "llvm/Support/ErrorHandling.h" 49#include "llvm/Support/MathExtras.h" 50#include "llvm/Target/TargetOptions.h" 51#include <bitset> 52#include <cctype> 53using namespace llvm; 54 55STATISTIC(NumTailCalls, "Number of tail calls"); 56 57// Forward declarations. 58static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 59 SDValue V2); 60 61/// Generate a DAG to grab 128-bits from a vector > 128 bits. This 62/// sets things up to match to an AVX VEXTRACTF128 instruction or a 63/// simple subregister reference. Idx is an index in the 128 bits we 64/// want. It need not be aligned to a 128-bit bounday. That makes 65/// lowering EXTRACT_VECTOR_ELT operations easier. 66static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, 67 SelectionDAG &DAG, DebugLoc dl) { 68 EVT VT = Vec.getValueType(); 69 assert(VT.is256BitVector() && "Unexpected vector size!"); 70 EVT ElVT = VT.getVectorElementType(); 71 unsigned Factor = VT.getSizeInBits()/128; 72 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, 73 VT.getVectorNumElements()/Factor); 74 75 // Extract from UNDEF is UNDEF. 76 if (Vec.getOpcode() == ISD::UNDEF) 77 return DAG.getUNDEF(ResultVT); 78 79 // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR 80 // we can match to VEXTRACTF128. 81 unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); 82 83 // This is the index of the first element of the 128-bit chunk 84 // we want. 85 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) 86 * ElemsPerChunk); 87 88 SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); 89 SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, 90 VecIdx); 91 92 return Result; 93} 94 95/// Generate a DAG to put 128-bits into a vector > 128 bits. 
This 96/// sets things up to match to an AVX VINSERTF128 instruction or a 97/// simple superregister reference. Idx is an index in the 128 bits 98/// we want. It need not be aligned to a 128-bit bounday. That makes 99/// lowering INSERT_VECTOR_ELT operations easier. 100static SDValue Insert128BitVector(SDValue Result, SDValue Vec, 101 unsigned IdxVal, SelectionDAG &DAG, 102 DebugLoc dl) { 103 // Inserting UNDEF is Result 104 if (Vec.getOpcode() == ISD::UNDEF) 105 return Result; 106 107 EVT VT = Vec.getValueType(); 108 assert(VT.is128BitVector() && "Unexpected vector size!"); 109 110 EVT ElVT = VT.getVectorElementType(); 111 EVT ResultVT = Result.getValueType(); 112 113 // Insert the relevant 128 bits. 114 unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); 115 116 // This is the index of the first element of the 128-bit chunk 117 // we want. 118 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) 119 * ElemsPerChunk); 120 121 SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); 122 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, 123 VecIdx); 124} 125 126/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 127/// instructions. This is used because creating CONCAT_VECTOR nodes of 128/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower 129/// large BUILD_VECTORS. 130static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, 131 unsigned NumElems, SelectionDAG &DAG, 132 DebugLoc dl) { 133 SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); 134 return Insert128BitVector(V, V2, NumElems/2, DAG, dl); 135} 136 137static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { 138 const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); 139 bool is64Bit = Subtarget->is64Bit(); 140 141 if (Subtarget->isTargetEnvMacho()) { 142 if (is64Bit) 143 return new X86_64MachoTargetObjectFile(); 144 return new TargetLoweringObjectFileMachO(); 145 } 146 147 if (Subtarget->isTargetLinux()) 148 return new X86LinuxTargetObjectFile(); 149 if (Subtarget->isTargetELF()) 150 return new TargetLoweringObjectFileELF(); 151 if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) 152 return new TargetLoweringObjectFileCOFF(); 153 llvm_unreachable("unknown subtarget type"); 154} 155 156X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 157 : TargetLowering(TM, createTLOF(TM)) { 158 Subtarget = &TM.getSubtarget<X86Subtarget>(); 159 X86ScalarSSEf64 = Subtarget->hasSSE2(); 160 X86ScalarSSEf32 = Subtarget->hasSSE1(); 161 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; 162 163 RegInfo = TM.getRegisterInfo(); 164 TD = getTargetData(); 165 166 // Set up the TargetLowering object. 167 static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; 168 169 // X86 is weird, it always uses i8 for shift amounts and setcc results. 170 setBooleanContents(ZeroOrOneBooleanContent); 171 // X86-SSE is even stranger. It uses -1 or 0 for vector masks. 172 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 173 174 // For 64-bit since we have so many registers use the ILP scheduler, for 175 // 32-bit code use the register pressure specific scheduling. 176 // For Atom, always use ILP scheduling. 
177 if (Subtarget->isAtom()) 178 setSchedulingPreference(Sched::ILP); 179 else if (Subtarget->is64Bit()) 180 setSchedulingPreference(Sched::ILP); 181 else 182 setSchedulingPreference(Sched::RegPressure); 183 setStackPointerRegisterToSaveRestore(X86StackPtr); 184 185 if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { 186 // Setup Windows compiler runtime calls. 187 setLibcallName(RTLIB::SDIV_I64, "_alldiv"); 188 setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); 189 setLibcallName(RTLIB::SREM_I64, "_allrem"); 190 setLibcallName(RTLIB::UREM_I64, "_aullrem"); 191 setLibcallName(RTLIB::MUL_I64, "_allmul"); 192 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); 193 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); 194 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); 195 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); 196 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); 197 198 // The _ftol2 runtime function has an unusual calling conv, which 199 // is modeled by a special pseudo-instruction. 200 setLibcallName(RTLIB::FPTOUINT_F64_I64, 0); 201 setLibcallName(RTLIB::FPTOUINT_F32_I64, 0); 202 setLibcallName(RTLIB::FPTOUINT_F64_I32, 0); 203 setLibcallName(RTLIB::FPTOUINT_F32_I32, 0); 204 } 205 206 if (Subtarget->isTargetDarwin()) { 207 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 208 setUseUnderscoreSetJmp(false); 209 setUseUnderscoreLongJmp(false); 210 } else if (Subtarget->isTargetMingw()) { 211 // MS runtime is weird: it exports _setjmp, but longjmp! 212 setUseUnderscoreSetJmp(true); 213 setUseUnderscoreLongJmp(false); 214 } else { 215 setUseUnderscoreSetJmp(true); 216 setUseUnderscoreLongJmp(true); 217 } 218 219 // Set up the register classes. 220 addRegisterClass(MVT::i8, &X86::GR8RegClass); 221 addRegisterClass(MVT::i16, &X86::GR16RegClass); 222 addRegisterClass(MVT::i32, &X86::GR32RegClass); 223 if (Subtarget->is64Bit()) 224 addRegisterClass(MVT::i64, &X86::GR64RegClass); 225 226 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 227 228 // We don't accept any truncstore of integer registers. 229 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 230 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 231 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 232 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 233 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 234 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 235 236 // SETOEQ and SETUNE require checking two conditions. 237 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 238 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 239 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 240 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 241 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 242 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 243 244 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 245 // operation. 246 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 247 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 248 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 249 250 if (Subtarget->is64Bit()) { 251 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 252 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 253 } else if (!TM.Options.UseSoftFloat) { 254 // We have an algorithm for SSE2->double, and we turn this into a 255 // 64-bit FILD followed by conditional FADD for other targets. 
256 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 257 // We have an algorithm for SSE2, and we turn this into a 64-bit 258 // FILD for other targets. 259 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 260 } 261 262 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 263 // this operation. 264 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 265 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 266 267 if (!TM.Options.UseSoftFloat) { 268 // SSE has no i16 to fp conversion, only i32 269 if (X86ScalarSSEf32) { 270 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 271 // f32 and f64 cases are Legal, f80 case is not 272 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 273 } else { 274 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 275 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 276 } 277 } else { 278 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 279 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 280 } 281 282 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 283 // are Legal, f80 is custom lowered. 284 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 285 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 286 287 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 288 // this operation. 289 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 290 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 291 292 if (X86ScalarSSEf32) { 293 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 294 // f32 and f64 cases are Legal, f80 case is not 295 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 296 } else { 297 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 298 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 299 } 300 301 // Handle FP_TO_UINT by promoting the destination to a larger signed 302 // conversion. 303 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 304 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 305 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 306 307 if (Subtarget->is64Bit()) { 308 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 309 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 310 } else if (!TM.Options.UseSoftFloat) { 311 // Since AVX is a superset of SSE3, only check for SSE here. 312 if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) 313 // Expand FP_TO_UINT into a select. 314 // FIXME: We would like to use a Custom expander here eventually to do 315 // the optimal thing for SSE vs. the default expansion in the legalizer. 316 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 317 else 318 // With SSE3 we can use fisttpll to convert to a signed i64; without 319 // SSE, we're stuck with a fistpll. 320 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 321 } 322 323 if (isTargetFTOL()) { 324 // Use the _ftol2 runtime function, which has a pseudo-instruction 325 // to handle its weird calling convention. 326 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); 327 } 328 329 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 330 if (!X86ScalarSSEf64) { 331 setOperationAction(ISD::BITCAST , MVT::f32 , Expand); 332 setOperationAction(ISD::BITCAST , MVT::i32 , Expand); 333 if (Subtarget->is64Bit()) { 334 setOperationAction(ISD::BITCAST , MVT::f64 , Expand); 335 // Without SSE, i64->f64 goes through memory. 
336 setOperationAction(ISD::BITCAST , MVT::i64 , Expand); 337 } 338 } 339 340 // Scalar integer divide and remainder are lowered to use operations that 341 // produce two results, to match the available instructions. This exposes 342 // the two-result form to trivial CSE, which is able to combine x/y and x%y 343 // into a single instruction. 344 // 345 // Scalar integer multiply-high is also lowered to use two-result 346 // operations, to match the available instructions. However, plain multiply 347 // (low) operations are left as Legal, as there are single-result 348 // instructions for this in x86. Using the two-result multiply instructions 349 // when both high and low results are needed must be arranged by dagcombine. 350 for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { 351 MVT VT = IntVTs[i]; 352 setOperationAction(ISD::MULHS, VT, Expand); 353 setOperationAction(ISD::MULHU, VT, Expand); 354 setOperationAction(ISD::SDIV, VT, Expand); 355 setOperationAction(ISD::UDIV, VT, Expand); 356 setOperationAction(ISD::SREM, VT, Expand); 357 setOperationAction(ISD::UREM, VT, Expand); 358 359 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 360 setOperationAction(ISD::ADDC, VT, Custom); 361 setOperationAction(ISD::ADDE, VT, Custom); 362 setOperationAction(ISD::SUBC, VT, Custom); 363 setOperationAction(ISD::SUBE, VT, Custom); 364 } 365 366 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 367 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 368 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 369 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 370 if (Subtarget->is64Bit()) 371 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 372 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 373 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 374 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 375 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 376 setOperationAction(ISD::FREM , MVT::f32 , Expand); 377 setOperationAction(ISD::FREM , MVT::f64 , Expand); 378 setOperationAction(ISD::FREM , MVT::f80 , Expand); 379 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 380 381 // Promote the i8 variants and force them on up to i32 which has a shorter 382 // encoding. 383 setOperationAction(ISD::CTTZ , MVT::i8 , Promote); 384 AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); 385 setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); 386 AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); 387 if (Subtarget->hasBMI()) { 388 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); 389 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); 390 if (Subtarget->is64Bit()) 391 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); 392 } else { 393 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 394 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 395 if (Subtarget->is64Bit()) 396 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 397 } 398 399 if (Subtarget->hasLZCNT()) { 400 // When promoting the i8 variants, force them to i32 for a shorter 401 // encoding. 
402 setOperationAction(ISD::CTLZ , MVT::i8 , Promote); 403 AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); 404 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); 405 AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); 406 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); 407 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); 408 if (Subtarget->is64Bit()) 409 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); 410 } else { 411 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 412 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 413 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 414 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); 415 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); 416 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); 417 if (Subtarget->is64Bit()) { 418 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 419 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); 420 } 421 } 422 423 if (Subtarget->hasPOPCNT()) { 424 setOperationAction(ISD::CTPOP , MVT::i8 , Promote); 425 } else { 426 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 427 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 428 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 429 if (Subtarget->is64Bit()) 430 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 431 } 432 433 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 434 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 435 436 // These should be promoted to a larger select which is supported. 437 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 438 // X86 wants to expand cmov itself. 439 setOperationAction(ISD::SELECT , MVT::i8 , Custom); 440 setOperationAction(ISD::SELECT , MVT::i16 , Custom); 441 setOperationAction(ISD::SELECT , MVT::i32 , Custom); 442 setOperationAction(ISD::SELECT , MVT::f32 , Custom); 443 setOperationAction(ISD::SELECT , MVT::f64 , Custom); 444 setOperationAction(ISD::SELECT , MVT::f80 , Custom); 445 setOperationAction(ISD::SETCC , MVT::i8 , Custom); 446 setOperationAction(ISD::SETCC , MVT::i16 , Custom); 447 setOperationAction(ISD::SETCC , MVT::i32 , Custom); 448 setOperationAction(ISD::SETCC , MVT::f32 , Custom); 449 setOperationAction(ISD::SETCC , MVT::f64 , Custom); 450 setOperationAction(ISD::SETCC , MVT::f80 , Custom); 451 if (Subtarget->is64Bit()) { 452 setOperationAction(ISD::SELECT , MVT::i64 , Custom); 453 setOperationAction(ISD::SETCC , MVT::i64 , Custom); 454 } 455 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); 456 457 // Darwin ABI issue. 
458 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); 459 setOperationAction(ISD::JumpTable , MVT::i32 , Custom); 460 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); 461 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); 462 if (Subtarget->is64Bit()) 463 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 464 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); 465 setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); 466 if (Subtarget->is64Bit()) { 467 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); 468 setOperationAction(ISD::JumpTable , MVT::i64 , Custom); 469 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); 470 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); 471 setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); 472 } 473 // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) 474 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); 475 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); 476 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); 477 if (Subtarget->is64Bit()) { 478 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); 479 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); 480 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); 481 } 482 483 if (Subtarget->hasSSE1()) 484 setOperationAction(ISD::PREFETCH , MVT::Other, Legal); 485 486 setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); 487 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); 488 489 // On X86 and X86-64, atomic operations are lowered to locked instructions. 490 // Locked instructions, in turn, have implicit fence semantics (all memory 491 // operations are flushed before issuing the locked instruction, and they 492 // are not buffered), so we can fold away the common pattern of 493 // fence-atomic-fence. 
494 setShouldFoldAtomicFences(true); 495 496 // Expand certain atomics 497 for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { 498 MVT VT = IntVTs[i]; 499 setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); 500 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); 501 setOperationAction(ISD::ATOMIC_STORE, VT, Custom); 502 } 503 504 if (!Subtarget->is64Bit()) { 505 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); 506 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 507 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 508 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 509 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 510 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 511 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 512 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 513 } 514 515 if (Subtarget->hasCmpxchg16b()) { 516 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); 517 } 518 519 // FIXME - use subtarget debug flags 520 if (!Subtarget->isTargetDarwin() && 521 !Subtarget->isTargetELF() && 522 !Subtarget->isTargetCygMing()) { 523 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 524 } 525 526 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 527 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 528 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 529 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 530 if (Subtarget->is64Bit()) { 531 setExceptionPointerRegister(X86::RAX); 532 setExceptionSelectorRegister(X86::RDX); 533 } else { 534 setExceptionPointerRegister(X86::EAX); 535 setExceptionSelectorRegister(X86::EDX); 536 } 537 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 538 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 539 540 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); 541 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); 542 543 setOperationAction(ISD::TRAP, MVT::Other, Legal); 544 545 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 546 setOperationAction(ISD::VASTART , MVT::Other, Custom); 547 setOperationAction(ISD::VAEND , MVT::Other, Expand); 548 if (Subtarget->is64Bit()) { 549 setOperationAction(ISD::VAARG , MVT::Other, Custom); 550 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 551 } else { 552 setOperationAction(ISD::VAARG , MVT::Other, Expand); 553 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 554 } 555 556 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 557 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 558 559 if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) 560 setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? 561 MVT::i64 : MVT::i32, Custom); 562 else if (TM.Options.EnableSegmentedStacks) 563 setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? 564 MVT::i64 : MVT::i32, Custom); 565 else 566 setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? 567 MVT::i64 : MVT::i32, Expand); 568 569 if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { 570 // f32 and f64 use SSE. 571 // Set up the FP register classes. 572 addRegisterClass(MVT::f32, &X86::FR32RegClass); 573 addRegisterClass(MVT::f64, &X86::FR64RegClass); 574 575 // Use ANDPD to simulate FABS. 576 setOperationAction(ISD::FABS , MVT::f64, Custom); 577 setOperationAction(ISD::FABS , MVT::f32, Custom); 578 579 // Use XORP to simulate FNEG. 
580 setOperationAction(ISD::FNEG , MVT::f64, Custom); 581 setOperationAction(ISD::FNEG , MVT::f32, Custom); 582 583 // Use ANDPD and ORPD to simulate FCOPYSIGN. 584 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 585 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 586 587 // Lower this to FGETSIGNx86 plus an AND. 588 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); 589 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); 590 591 // We don't support sin/cos/fmod 592 setOperationAction(ISD::FSIN , MVT::f64, Expand); 593 setOperationAction(ISD::FCOS , MVT::f64, Expand); 594 setOperationAction(ISD::FSIN , MVT::f32, Expand); 595 setOperationAction(ISD::FCOS , MVT::f32, Expand); 596 597 // Expand FP immediates into loads from the stack, except for the special 598 // cases we handle. 599 addLegalFPImmediate(APFloat(+0.0)); // xorpd 600 addLegalFPImmediate(APFloat(+0.0f)); // xorps 601 } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { 602 // Use SSE for f32, x87 for f64. 603 // Set up the FP register classes. 604 addRegisterClass(MVT::f32, &X86::FR32RegClass); 605 addRegisterClass(MVT::f64, &X86::RFP64RegClass); 606 607 // Use ANDPS to simulate FABS. 608 setOperationAction(ISD::FABS , MVT::f32, Custom); 609 610 // Use XORP to simulate FNEG. 611 setOperationAction(ISD::FNEG , MVT::f32, Custom); 612 613 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 614 615 // Use ANDPS and ORPS to simulate FCOPYSIGN. 616 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 617 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 618 619 // We don't support sin/cos/fmod 620 setOperationAction(ISD::FSIN , MVT::f32, Expand); 621 setOperationAction(ISD::FCOS , MVT::f32, Expand); 622 623 // Special cases we handle for FP constants. 624 addLegalFPImmediate(APFloat(+0.0f)); // xorps 625 addLegalFPImmediate(APFloat(+0.0)); // FLD0 626 addLegalFPImmediate(APFloat(+1.0)); // FLD1 627 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 628 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 629 630 if (!TM.Options.UnsafeFPMath) { 631 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 632 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 633 } 634 } else if (!TM.Options.UseSoftFloat) { 635 // f32 and f64 in x87. 636 // Set up the FP register classes. 637 addRegisterClass(MVT::f64, &X86::RFP64RegClass); 638 addRegisterClass(MVT::f32, &X86::RFP32RegClass); 639 640 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 641 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 642 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 643 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 644 645 if (!TM.Options.UnsafeFPMath) { 646 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 647 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 648 } 649 addLegalFPImmediate(APFloat(+0.0)); // FLD0 650 addLegalFPImmediate(APFloat(+1.0)); // FLD1 651 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 652 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 653 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 654 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 655 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 656 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 657 } 658 659 // We don't support FMA. 660 setOperationAction(ISD::FMA, MVT::f64, Expand); 661 setOperationAction(ISD::FMA, MVT::f32, Expand); 662 663 // Long double always uses X87. 
664 if (!TM.Options.UseSoftFloat) { 665 addRegisterClass(MVT::f80, &X86::RFP80RegClass); 666 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 667 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 668 { 669 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); 670 addLegalFPImmediate(TmpFlt); // FLD0 671 TmpFlt.changeSign(); 672 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 673 674 bool ignored; 675 APFloat TmpFlt2(+1.0); 676 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 677 &ignored); 678 addLegalFPImmediate(TmpFlt2); // FLD1 679 TmpFlt2.changeSign(); 680 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 681 } 682 683 if (!TM.Options.UnsafeFPMath) { 684 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 685 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 686 } 687 688 setOperationAction(ISD::FFLOOR, MVT::f80, Expand); 689 setOperationAction(ISD::FCEIL, MVT::f80, Expand); 690 setOperationAction(ISD::FTRUNC, MVT::f80, Expand); 691 setOperationAction(ISD::FRINT, MVT::f80, Expand); 692 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); 693 setOperationAction(ISD::FMA, MVT::f80, Expand); 694 } 695 696 // Always use a library call for pow. 697 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 698 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 699 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 700 701 setOperationAction(ISD::FLOG, MVT::f80, Expand); 702 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 703 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 704 setOperationAction(ISD::FEXP, MVT::f80, Expand); 705 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 706 707 // First set operation action for all vector types to either promote 708 // (for widening) or expand (for scalarization). Then we will selectively 709 // turn on ones that can be effectively codegen'd. 
710 for (int VT = MVT::FIRST_VECTOR_VALUETYPE; 711 VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) { 712 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 713 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 714 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 715 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 716 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 717 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 718 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 719 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 720 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 721 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 722 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 723 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 724 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 725 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 726 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 727 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 728 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 729 setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 730 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 731 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 732 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 733 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 734 setOperationAction(ISD::FMA, (MVT::SimpleValueType)VT, Expand); 735 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 736 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 737 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 738 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 739 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 740 setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 741 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 742 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 743 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 744 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 745 setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); 746 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 747 setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); 748 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 749 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 750 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 751 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 752 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 753 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 754 setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand); 755 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 756 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 757 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 758 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 759 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 760 setOperationAction(ISD::FP_TO_UINT, 
(MVT::SimpleValueType)VT, Expand); 761 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 762 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 763 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 764 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 765 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 766 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 767 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 768 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 769 setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand); 770 for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; 771 InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 772 setTruncStoreAction((MVT::SimpleValueType)VT, 773 (MVT::SimpleValueType)InnerVT, Expand); 774 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 775 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 776 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 777 } 778 779 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 780 // with -msoft-float, disable use of MMX as well. 781 if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { 782 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); 783 // No operations on x86mmx supported, everything uses intrinsics. 784 } 785 786 // MMX-sized vectors (other than x86mmx) are expected to be expanded 787 // into smaller operations. 788 setOperationAction(ISD::MULHS, MVT::v8i8, Expand); 789 setOperationAction(ISD::MULHS, MVT::v4i16, Expand); 790 setOperationAction(ISD::MULHS, MVT::v2i32, Expand); 791 setOperationAction(ISD::MULHS, MVT::v1i64, Expand); 792 setOperationAction(ISD::AND, MVT::v8i8, Expand); 793 setOperationAction(ISD::AND, MVT::v4i16, Expand); 794 setOperationAction(ISD::AND, MVT::v2i32, Expand); 795 setOperationAction(ISD::AND, MVT::v1i64, Expand); 796 setOperationAction(ISD::OR, MVT::v8i8, Expand); 797 setOperationAction(ISD::OR, MVT::v4i16, Expand); 798 setOperationAction(ISD::OR, MVT::v2i32, Expand); 799 setOperationAction(ISD::OR, MVT::v1i64, Expand); 800 setOperationAction(ISD::XOR, MVT::v8i8, Expand); 801 setOperationAction(ISD::XOR, MVT::v4i16, Expand); 802 setOperationAction(ISD::XOR, MVT::v2i32, Expand); 803 setOperationAction(ISD::XOR, MVT::v1i64, Expand); 804 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); 805 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); 806 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); 807 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); 808 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); 809 setOperationAction(ISD::SELECT, MVT::v8i8, Expand); 810 setOperationAction(ISD::SELECT, MVT::v4i16, Expand); 811 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 812 setOperationAction(ISD::SELECT, MVT::v1i64, Expand); 813 setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); 814 setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); 815 setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); 816 setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); 817 818 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { 819 addRegisterClass(MVT::v4f32, &X86::VR128RegClass); 820 821 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 822 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 823 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 824 setOperationAction(ISD::FDIV, MVT::v4f32, 
Legal); 825 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 826 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 827 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 828 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 829 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 830 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 831 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 832 } 833 834 if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { 835 addRegisterClass(MVT::v2f64, &X86::VR128RegClass); 836 837 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 838 // registers cannot be used even for integer operations. 839 addRegisterClass(MVT::v16i8, &X86::VR128RegClass); 840 addRegisterClass(MVT::v8i16, &X86::VR128RegClass); 841 addRegisterClass(MVT::v4i32, &X86::VR128RegClass); 842 addRegisterClass(MVT::v2i64, &X86::VR128RegClass); 843 844 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 845 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 846 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 847 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 848 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 849 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 850 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 851 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 852 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 853 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 854 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 855 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 856 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 857 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 858 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 859 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 860 861 setOperationAction(ISD::SETCC, MVT::v2i64, Custom); 862 setOperationAction(ISD::SETCC, MVT::v16i8, Custom); 863 setOperationAction(ISD::SETCC, MVT::v8i16, Custom); 864 setOperationAction(ISD::SETCC, MVT::v4i32, Custom); 865 866 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 867 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 868 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 869 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 870 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 871 872 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 
873 for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { 874 MVT VT = (MVT::SimpleValueType)i; 875 // Do not attempt to custom lower non-power-of-2 vectors 876 if (!isPowerOf2_32(VT.getVectorNumElements())) 877 continue; 878 // Do not attempt to custom lower non-128-bit vectors 879 if (!VT.is128BitVector()) 880 continue; 881 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 882 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 883 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 884 } 885 886 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 887 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 888 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 889 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 890 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 891 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 892 893 if (Subtarget->is64Bit()) { 894 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 895 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 896 } 897 898 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 899 for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { 900 MVT VT = (MVT::SimpleValueType)i; 901 902 // Do not attempt to promote non-128-bit vectors 903 if (!VT.is128BitVector()) 904 continue; 905 906 setOperationAction(ISD::AND, VT, Promote); 907 AddPromotedToType (ISD::AND, VT, MVT::v2i64); 908 setOperationAction(ISD::OR, VT, Promote); 909 AddPromotedToType (ISD::OR, VT, MVT::v2i64); 910 setOperationAction(ISD::XOR, VT, Promote); 911 AddPromotedToType (ISD::XOR, VT, MVT::v2i64); 912 setOperationAction(ISD::LOAD, VT, Promote); 913 AddPromotedToType (ISD::LOAD, VT, MVT::v2i64); 914 setOperationAction(ISD::SELECT, VT, Promote); 915 AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); 916 } 917 918 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 919 920 // Custom lower v2i64 and v2f64 selects. 921 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 922 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 923 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 924 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 925 926 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 927 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 928 } 929 930 if (Subtarget->hasSSE41()) { 931 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 932 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 933 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 934 setOperationAction(ISD::FRINT, MVT::f32, Legal); 935 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 936 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 937 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 938 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 939 setOperationAction(ISD::FRINT, MVT::f64, Legal); 940 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 941 942 // FIXME: Do we need to handle scalar-to-vector here? 943 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 944 945 setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); 946 setOperationAction(ISD::VSELECT, MVT::v2i64, Legal); 947 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); 948 setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); 949 setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); 950 951 // i8 and i16 vectors are custom , because the source register and source 952 // source memory operand types are not the same width. f32 vectors are 953 // custom since the immediate controlling the insert encodes additional 954 // information. 
955 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 956 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 957 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 958 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 959 960 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 961 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 962 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 963 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 964 965 // FIXME: these should be Legal but thats only for the case where 966 // the index is constant. For now custom expand to deal with that. 967 if (Subtarget->is64Bit()) { 968 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 969 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 970 } 971 } 972 973 if (Subtarget->hasSSE2()) { 974 setOperationAction(ISD::SRL, MVT::v8i16, Custom); 975 setOperationAction(ISD::SRL, MVT::v16i8, Custom); 976 977 setOperationAction(ISD::SHL, MVT::v8i16, Custom); 978 setOperationAction(ISD::SHL, MVT::v16i8, Custom); 979 980 setOperationAction(ISD::SRA, MVT::v8i16, Custom); 981 setOperationAction(ISD::SRA, MVT::v16i8, Custom); 982 983 if (Subtarget->hasAVX2()) { 984 setOperationAction(ISD::SRL, MVT::v2i64, Legal); 985 setOperationAction(ISD::SRL, MVT::v4i32, Legal); 986 987 setOperationAction(ISD::SHL, MVT::v2i64, Legal); 988 setOperationAction(ISD::SHL, MVT::v4i32, Legal); 989 990 setOperationAction(ISD::SRA, MVT::v4i32, Legal); 991 } else { 992 setOperationAction(ISD::SRL, MVT::v2i64, Custom); 993 setOperationAction(ISD::SRL, MVT::v4i32, Custom); 994 995 setOperationAction(ISD::SHL, MVT::v2i64, Custom); 996 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 997 998 setOperationAction(ISD::SRA, MVT::v4i32, Custom); 999 } 1000 } 1001 1002 if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { 1003 addRegisterClass(MVT::v32i8, &X86::VR256RegClass); 1004 addRegisterClass(MVT::v16i16, &X86::VR256RegClass); 1005 addRegisterClass(MVT::v8i32, &X86::VR256RegClass); 1006 addRegisterClass(MVT::v8f32, &X86::VR256RegClass); 1007 addRegisterClass(MVT::v4i64, &X86::VR256RegClass); 1008 addRegisterClass(MVT::v4f64, &X86::VR256RegClass); 1009 1010 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 1011 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 1012 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 1013 1014 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 1015 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 1016 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 1017 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 1018 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 1019 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 1020 1021 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 1022 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 1023 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 1024 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 1025 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 1026 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 1027 1028 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); 1029 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); 1030 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); 1031 1032 setOperationAction(ISD::SRL, MVT::v16i16, Custom); 1033 setOperationAction(ISD::SRL, MVT::v32i8, Custom); 1034 1035 setOperationAction(ISD::SHL, MVT::v16i16, Custom); 1036 setOperationAction(ISD::SHL, MVT::v32i8, Custom); 1037 1038 
setOperationAction(ISD::SRA, MVT::v16i16, Custom); 1039 setOperationAction(ISD::SRA, MVT::v32i8, Custom); 1040 1041 setOperationAction(ISD::SETCC, MVT::v32i8, Custom); 1042 setOperationAction(ISD::SETCC, MVT::v16i16, Custom); 1043 setOperationAction(ISD::SETCC, MVT::v8i32, Custom); 1044 setOperationAction(ISD::SETCC, MVT::v4i64, Custom); 1045 1046 setOperationAction(ISD::SELECT, MVT::v4f64, Custom); 1047 setOperationAction(ISD::SELECT, MVT::v4i64, Custom); 1048 setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 1049 1050 setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); 1051 setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); 1052 setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); 1053 setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); 1054 1055 if (Subtarget->hasFMA()) { 1056 setOperationAction(ISD::FMA, MVT::v8f32, Custom); 1057 setOperationAction(ISD::FMA, MVT::v4f64, Custom); 1058 setOperationAction(ISD::FMA, MVT::v4f32, Custom); 1059 setOperationAction(ISD::FMA, MVT::v2f64, Custom); 1060 setOperationAction(ISD::FMA, MVT::f32, Custom); 1061 setOperationAction(ISD::FMA, MVT::f64, Custom); 1062 } 1063 1064 if (Subtarget->hasAVX2()) { 1065 setOperationAction(ISD::ADD, MVT::v4i64, Legal); 1066 setOperationAction(ISD::ADD, MVT::v8i32, Legal); 1067 setOperationAction(ISD::ADD, MVT::v16i16, Legal); 1068 setOperationAction(ISD::ADD, MVT::v32i8, Legal); 1069 1070 setOperationAction(ISD::SUB, MVT::v4i64, Legal); 1071 setOperationAction(ISD::SUB, MVT::v8i32, Legal); 1072 setOperationAction(ISD::SUB, MVT::v16i16, Legal); 1073 setOperationAction(ISD::SUB, MVT::v32i8, Legal); 1074 1075 setOperationAction(ISD::MUL, MVT::v4i64, Custom); 1076 setOperationAction(ISD::MUL, MVT::v8i32, Legal); 1077 setOperationAction(ISD::MUL, MVT::v16i16, Legal); 1078 // Don't lower v32i8 because there is no 128-bit byte mul 1079 1080 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); 1081 1082 setOperationAction(ISD::SRL, MVT::v4i64, Legal); 1083 setOperationAction(ISD::SRL, MVT::v8i32, Legal); 1084 1085 setOperationAction(ISD::SHL, MVT::v4i64, Legal); 1086 setOperationAction(ISD::SHL, MVT::v8i32, Legal); 1087 1088 setOperationAction(ISD::SRA, MVT::v8i32, Legal); 1089 } else { 1090 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 1091 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 1092 setOperationAction(ISD::ADD, MVT::v16i16, Custom); 1093 setOperationAction(ISD::ADD, MVT::v32i8, Custom); 1094 1095 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 1096 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 1097 setOperationAction(ISD::SUB, MVT::v16i16, Custom); 1098 setOperationAction(ISD::SUB, MVT::v32i8, Custom); 1099 1100 setOperationAction(ISD::MUL, MVT::v4i64, Custom); 1101 setOperationAction(ISD::MUL, MVT::v8i32, Custom); 1102 setOperationAction(ISD::MUL, MVT::v16i16, Custom); 1103 // Don't lower v32i8 because there is no 128-bit byte mul 1104 1105 setOperationAction(ISD::SRL, MVT::v4i64, Custom); 1106 setOperationAction(ISD::SRL, MVT::v8i32, Custom); 1107 1108 setOperationAction(ISD::SHL, MVT::v4i64, Custom); 1109 setOperationAction(ISD::SHL, MVT::v8i32, Custom); 1110 1111 setOperationAction(ISD::SRA, MVT::v8i32, Custom); 1112 } 1113 1114 // Custom lower several nodes for 256-bit types. 1115 for (int i = MVT::FIRST_VECTOR_VALUETYPE; 1116 i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { 1117 MVT VT = (MVT::SimpleValueType)i; 1118 1119 // Extract subvector is special because the value type 1120 // (result) is 128-bit but the source is 256-bit wide. 
1121 if (VT.is128BitVector()) 1122 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1123 1124 // Do not attempt to custom lower other non-256-bit vectors 1125 if (!VT.is256BitVector()) 1126 continue; 1127 1128 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1129 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1130 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1131 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1132 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 1133 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1134 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1135 } 1136 1137 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 1138 for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { 1139 MVT VT = (MVT::SimpleValueType)i; 1140 1141 // Do not attempt to promote non-256-bit vectors 1142 if (!VT.is256BitVector()) 1143 continue; 1144 1145 setOperationAction(ISD::AND, VT, Promote); 1146 AddPromotedToType (ISD::AND, VT, MVT::v4i64); 1147 setOperationAction(ISD::OR, VT, Promote); 1148 AddPromotedToType (ISD::OR, VT, MVT::v4i64); 1149 setOperationAction(ISD::XOR, VT, Promote); 1150 AddPromotedToType (ISD::XOR, VT, MVT::v4i64); 1151 setOperationAction(ISD::LOAD, VT, Promote); 1152 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); 1153 setOperationAction(ISD::SELECT, VT, Promote); 1154 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); 1155 } 1156 } 1157 1158 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion 1159 // of this type with custom code. 1160 for (int VT = MVT::FIRST_VECTOR_VALUETYPE; 1161 VT != MVT::LAST_VECTOR_VALUETYPE; VT++) { 1162 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, 1163 Custom); 1164 } 1165 1166 // We want to custom lower some of our intrinsics. 1167 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1168 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 1169 1170 1171 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 1172 // handle type legalization for these operations here. 1173 // 1174 // FIXME: We really should do custom legalization for addition and 1175 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 1176 // than generic legalization for 64-bit multiplication-with-overflow, though. 1177 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { 1178 // Add/Sub/Mul with overflow operations are custom lowered. 1179 MVT VT = IntVTs[i]; 1180 setOperationAction(ISD::SADDO, VT, Custom); 1181 setOperationAction(ISD::UADDO, VT, Custom); 1182 setOperationAction(ISD::SSUBO, VT, Custom); 1183 setOperationAction(ISD::USUBO, VT, Custom); 1184 setOperationAction(ISD::SMULO, VT, Custom); 1185 setOperationAction(ISD::UMULO, VT, Custom); 1186 } 1187 1188 // There are no 8-bit 3-address imul/mul instructions 1189 setOperationAction(ISD::SMULO, MVT::i8, Expand); 1190 setOperationAction(ISD::UMULO, MVT::i8, Expand); 1191 1192 if (!Subtarget->is64Bit()) { 1193 // These libcalls are not available in 32-bit. 
1194 setLibcallName(RTLIB::SHL_I128, 0); 1195 setLibcallName(RTLIB::SRL_I128, 0); 1196 setLibcallName(RTLIB::SRA_I128, 0); 1197 } 1198 1199 // We have target-specific dag combine patterns for the following nodes: 1200 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 1201 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 1202 setTargetDAGCombine(ISD::VSELECT); 1203 setTargetDAGCombine(ISD::SELECT); 1204 setTargetDAGCombine(ISD::SHL); 1205 setTargetDAGCombine(ISD::SRA); 1206 setTargetDAGCombine(ISD::SRL); 1207 setTargetDAGCombine(ISD::OR); 1208 setTargetDAGCombine(ISD::AND); 1209 setTargetDAGCombine(ISD::ADD); 1210 setTargetDAGCombine(ISD::FADD); 1211 setTargetDAGCombine(ISD::FSUB); 1212 setTargetDAGCombine(ISD::FMA); 1213 setTargetDAGCombine(ISD::SUB); 1214 setTargetDAGCombine(ISD::LOAD); 1215 setTargetDAGCombine(ISD::STORE); 1216 setTargetDAGCombine(ISD::ZERO_EXTEND); 1217 setTargetDAGCombine(ISD::ANY_EXTEND); 1218 setTargetDAGCombine(ISD::SIGN_EXTEND); 1219 setTargetDAGCombine(ISD::TRUNCATE); 1220 setTargetDAGCombine(ISD::UINT_TO_FP); 1221 setTargetDAGCombine(ISD::SINT_TO_FP); 1222 setTargetDAGCombine(ISD::SETCC); 1223 setTargetDAGCombine(ISD::FP_TO_SINT); 1224 if (Subtarget->is64Bit()) 1225 setTargetDAGCombine(ISD::MUL); 1226 setTargetDAGCombine(ISD::XOR); 1227 1228 computeRegisterProperties(); 1229 1230 // On Darwin, -Os means optimize for size without hurting performance, 1231 // do not reduce the limit. 1232 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 1233 maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; 1234 maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 1235 maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 1236 maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores 1237 maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 1238 setPrefLoopAlignment(4); // 2^4 bytes. 1239 benefitFromCodePlacementOpt = true; 1240 1241 // Predictable cmov don't hurt on atom because it's in-order. 1242 predictableSelectIsExpensive = !Subtarget->isAtom(); 1243 1244 setPrefFunctionAlignment(4); // 2^4 bytes. 1245} 1246 1247 1248EVT X86TargetLowering::getSetCCResultType(EVT VT) const { 1249 if (!VT.isVector()) return MVT::i8; 1250 return VT.changeVectorElementTypeToInteger(); 1251} 1252 1253 1254/// getMaxByValAlign - Helper for getByValTypeAlignment to determine 1255/// the desired ByVal argument alignment. 1256static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { 1257 if (MaxAlign == 16) 1258 return; 1259 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1260 if (VTy->getBitWidth() == 128) 1261 MaxAlign = 16; 1262 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1263 unsigned EltAlign = 0; 1264 getMaxByValAlign(ATy->getElementType(), EltAlign); 1265 if (EltAlign > MaxAlign) 1266 MaxAlign = EltAlign; 1267 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 1268 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 1269 unsigned EltAlign = 0; 1270 getMaxByValAlign(STy->getElementType(i), EltAlign); 1271 if (EltAlign > MaxAlign) 1272 MaxAlign = EltAlign; 1273 if (MaxAlign == 16) 1274 break; 1275 } 1276 } 1277} 1278 1279/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 1280/// function arguments in the caller parameter area. For X86, aggregates 1281/// that contain SSE vectors are placed at 16-byte boundaries while the rest 1282/// are at 4-byte boundaries. 
1283unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { 1284 if (Subtarget->is64Bit()) { 1285 // Max of 8 and alignment of type. 1286 unsigned TyAlign = TD->getABITypeAlignment(Ty); 1287 if (TyAlign > 8) 1288 return TyAlign; 1289 return 8; 1290 } 1291 1292 unsigned Align = 4; 1293 if (Subtarget->hasSSE1()) 1294 getMaxByValAlign(Ty, Align); 1295 return Align; 1296} 1297 1298/// getOptimalMemOpType - Returns the target specific optimal type for load 1299/// and store operations as a result of memset, memcpy, and memmove 1300/// lowering. If DstAlign is zero that means it's safe to destination 1301/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it 1302/// means there isn't a need to check it against alignment requirement, 1303/// probably because the source does not need to be loaded. If 1304/// 'IsZeroVal' is true, that means it's safe to return a 1305/// non-scalar-integer type, e.g. empty string source, constant, or loaded 1306/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is 1307/// constant so it does not need to be loaded. 1308/// It returns EVT::Other if the type should be determined using generic 1309/// target-independent logic. 1310EVT 1311X86TargetLowering::getOptimalMemOpType(uint64_t Size, 1312 unsigned DstAlign, unsigned SrcAlign, 1313 bool IsZeroVal, 1314 bool MemcpyStrSrc, 1315 MachineFunction &MF) const { 1316 // FIXME: This turns off use of xmm stores for memset/memcpy on targets like 1317 // linux. This is because the stack realignment code can't handle certain 1318 // cases like PR2962. This should be removed when PR2962 is fixed. 1319 const Function *F = MF.getFunction(); 1320 if (IsZeroVal && 1321 !F->hasFnAttr(Attribute::NoImplicitFloat)) { 1322 if (Size >= 16 && 1323 (Subtarget->isUnalignedMemAccessFast() || 1324 ((DstAlign == 0 || DstAlign >= 16) && 1325 (SrcAlign == 0 || SrcAlign >= 16))) && 1326 Subtarget->getStackAlignment() >= 16) { 1327 if (Subtarget->getStackAlignment() >= 32) { 1328 if (Subtarget->hasAVX2()) 1329 return MVT::v8i32; 1330 if (Subtarget->hasAVX()) 1331 return MVT::v8f32; 1332 } 1333 if (Subtarget->hasSSE2()) 1334 return MVT::v4i32; 1335 if (Subtarget->hasSSE1()) 1336 return MVT::v4f32; 1337 } else if (!MemcpyStrSrc && Size >= 8 && 1338 !Subtarget->is64Bit() && 1339 Subtarget->getStackAlignment() >= 8 && 1340 Subtarget->hasSSE2()) { 1341 // Do not use f64 to lower memcpy if source is string constant. It's 1342 // better to use i32 to avoid the loads. 1343 return MVT::f64; 1344 } 1345 } 1346 if (Subtarget->is64Bit() && Size >= 8) 1347 return MVT::i64; 1348 return MVT::i32; 1349} 1350 1351/// getJumpTableEncoding - Return the entry encoding for a jump table in the 1352/// current function. The returned value is a member of the 1353/// MachineJumpTableInfo::JTEntryKind enum. 1354unsigned X86TargetLowering::getJumpTableEncoding() const { 1355 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 1356 // symbol. 1357 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1358 Subtarget->isPICStyleGOT()) 1359 return MachineJumpTableInfo::EK_Custom32; 1360 1361 // Otherwise, use the normal jump table encoding heuristics. 
1362 return TargetLowering::getJumpTableEncoding(); 1363} 1364 1365const MCExpr * 1366X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1367 const MachineBasicBlock *MBB, 1368 unsigned uid,MCContext &Ctx) const{ 1369 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1370 Subtarget->isPICStyleGOT()); 1371 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1372 // entries. 1373 return MCSymbolRefExpr::Create(MBB->getSymbol(), 1374 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1375} 1376 1377/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 1378/// jumptable. 1379SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1380 SelectionDAG &DAG) const { 1381 if (!Subtarget->is64Bit()) 1382 // This doesn't have DebugLoc associated with it, but is not really the 1383 // same as a Register. 1384 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1385 return Table; 1386} 1387 1388/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1389/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1390/// MCExpr. 1391const MCExpr *X86TargetLowering:: 1392getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1393 MCContext &Ctx) const { 1394 // X86-64 uses RIP relative addressing based on the jump table label. 1395 if (Subtarget->isPICStyleRIPRel()) 1396 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1397 1398 // Otherwise, the reference is relative to the PIC base. 1399 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); 1400} 1401 1402// FIXME: Why this routine is here? Move to RegInfo! 1403std::pair<const TargetRegisterClass*, uint8_t> 1404X86TargetLowering::findRepresentativeClass(EVT VT) const{ 1405 const TargetRegisterClass *RRC = 0; 1406 uint8_t Cost = 1; 1407 switch (VT.getSimpleVT().SimpleTy) { 1408 default: 1409 return TargetLowering::findRepresentativeClass(VT); 1410 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1411 RRC = Subtarget->is64Bit() ? 
1412 (const TargetRegisterClass*)&X86::GR64RegClass : 1413 (const TargetRegisterClass*)&X86::GR32RegClass; 1414 break; 1415 case MVT::x86mmx: 1416 RRC = &X86::VR64RegClass; 1417 break; 1418 case MVT::f32: case MVT::f64: 1419 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1420 case MVT::v4f32: case MVT::v2f64: 1421 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1422 case MVT::v4f64: 1423 RRC = &X86::VR128RegClass; 1424 break; 1425 } 1426 return std::make_pair(RRC, Cost); 1427} 1428 1429bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1430 unsigned &Offset) const { 1431 if (!Subtarget->isTargetLinux()) 1432 return false; 1433 1434 if (Subtarget->is64Bit()) { 1435 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1436 Offset = 0x28; 1437 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1438 AddressSpace = 256; 1439 else 1440 AddressSpace = 257; 1441 } else { 1442 // %gs:0x14 on i386 1443 Offset = 0x14; 1444 AddressSpace = 256; 1445 } 1446 return true; 1447} 1448 1449 1450//===----------------------------------------------------------------------===// 1451// Return Value Calling Convention Implementation 1452//===----------------------------------------------------------------------===// 1453 1454#include "X86GenCallingConv.inc" 1455 1456bool 1457X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1458 MachineFunction &MF, bool isVarArg, 1459 const SmallVectorImpl<ISD::OutputArg> &Outs, 1460 LLVMContext &Context) const { 1461 SmallVector<CCValAssign, 16> RVLocs; 1462 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1463 RVLocs, Context); 1464 return CCInfo.CheckReturn(Outs, RetCC_X86); 1465} 1466 1467SDValue 1468X86TargetLowering::LowerReturn(SDValue Chain, 1469 CallingConv::ID CallConv, bool isVarArg, 1470 const SmallVectorImpl<ISD::OutputArg> &Outs, 1471 const SmallVectorImpl<SDValue> &OutVals, 1472 DebugLoc dl, SelectionDAG &DAG) const { 1473 MachineFunction &MF = DAG.getMachineFunction(); 1474 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1475 1476 SmallVector<CCValAssign, 16> RVLocs; 1477 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1478 RVLocs, *DAG.getContext()); 1479 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1480 1481 // Add the regs to the liveout set for the function. 1482 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1483 for (unsigned i = 0; i != RVLocs.size(); ++i) 1484 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1485 MRI.addLiveOut(RVLocs[i].getLocReg()); 1486 1487 SDValue Flag; 1488 1489 SmallVector<SDValue, 6> RetOps; 1490 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1491 // Operand #1 = Bytes To Pop 1492 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1493 MVT::i16)); 1494 1495 // Copy the result values into the output registers. 
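  // The promotion below is just a mapping from the assigned location info to
  // an ISD extension opcode; a minimal sketch (hypothetical helper, not used
  // in this file):
  //
  //   static unsigned getExtendOpcForLocInfo(CCValAssign::LocInfo LI) {
  //     switch (LI) {
  //     case CCValAssign::SExt: return ISD::SIGN_EXTEND;
  //     case CCValAssign::ZExt: return ISD::ZERO_EXTEND;
  //     case CCValAssign::AExt: return ISD::ANY_EXTEND;
  //     case CCValAssign::BCvt: return ISD::BITCAST;
  //     default:                return 0; // value is already in LocVT form
  //     }
  //   }
  //
  // e.g. an i1 return value marked zeroext typically reaches this loop with
  // LocInfo == ZExt and is widened to its 8-bit location before the
  // CopyToReg.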
1496 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1497 CCValAssign &VA = RVLocs[i]; 1498 assert(VA.isRegLoc() && "Can only return in registers!"); 1499 SDValue ValToCopy = OutVals[i]; 1500 EVT ValVT = ValToCopy.getValueType(); 1501 1502 // Promote values to the appropriate types 1503 if (VA.getLocInfo() == CCValAssign::SExt) 1504 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 1505 else if (VA.getLocInfo() == CCValAssign::ZExt) 1506 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); 1507 else if (VA.getLocInfo() == CCValAssign::AExt) 1508 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); 1509 else if (VA.getLocInfo() == CCValAssign::BCvt) 1510 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); 1511 1512 // If this is x86-64, and we disabled SSE, we can't return FP values, 1513 // or SSE or MMX vectors. 1514 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1515 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1516 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1517 report_fatal_error("SSE register return with SSE disabled"); 1518 } 1519 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1520 // llvm-gcc has never done it right and no one has noticed, so this 1521 // should be OK for now. 1522 if (ValVT == MVT::f64 && 1523 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) 1524 report_fatal_error("SSE2 register return with SSE2 disabled"); 1525 1526 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1527 // the RET instruction and handled by the FP Stackifier. 1528 if (VA.getLocReg() == X86::ST0 || 1529 VA.getLocReg() == X86::ST1) { 1530 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1531 // change the value to the FP stack register class. 1532 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1533 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1534 RetOps.push_back(ValToCopy); 1535 // Don't emit a copytoreg. 1536 continue; 1537 } 1538 1539 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1540 // which is returned in RAX / RDX. 1541 if (Subtarget->is64Bit()) { 1542 if (ValVT == MVT::x86mmx) { 1543 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1544 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1545 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1546 ValToCopy); 1547 // If we don't have SSE2 available, convert to v4f32 so the generated 1548 // register is legal. 1549 if (!Subtarget->hasSSE2()) 1550 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1551 } 1552 } 1553 } 1554 1555 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1556 Flag = Chain.getValue(1); 1557 } 1558 1559 // The x86-64 ABI for returning structs by value requires that we copy 1560 // the sret argument into %rax for the return. We saved the argument into 1561 // a virtual register in the entry block, so now we copy the value out 1562 // and into %rax. 
1563 if (Subtarget->is64Bit() && 1564 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1565 MachineFunction &MF = DAG.getMachineFunction(); 1566 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1567 unsigned Reg = FuncInfo->getSRetReturnReg(); 1568 assert(Reg && 1569 "SRetReturnReg should have been set in LowerFormalArguments()."); 1570 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1571 1572 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1573 Flag = Chain.getValue(1); 1574 1575 // RAX now acts like a return value. 1576 MRI.addLiveOut(X86::RAX); 1577 } 1578 1579 RetOps[0] = Chain; // Update chain. 1580 1581 // Add the flag if we have it. 1582 if (Flag.getNode()) 1583 RetOps.push_back(Flag); 1584 1585 return DAG.getNode(X86ISD::RET_FLAG, dl, 1586 MVT::Other, &RetOps[0], RetOps.size()); 1587} 1588 1589bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 1590 if (N->getNumValues() != 1) 1591 return false; 1592 if (!N->hasNUsesOfValue(1, 0)) 1593 return false; 1594 1595 SDValue TCChain = Chain; 1596 SDNode *Copy = *N->use_begin(); 1597 if (Copy->getOpcode() == ISD::CopyToReg) { 1598 // If the copy has a glue operand, we conservatively assume it isn't safe to 1599 // perform a tail call. 1600 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 1601 return false; 1602 TCChain = Copy->getOperand(0); 1603 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 1604 return false; 1605 1606 bool HasRet = false; 1607 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1608 UI != UE; ++UI) { 1609 if (UI->getOpcode() != X86ISD::RET_FLAG) 1610 return false; 1611 HasRet = true; 1612 } 1613 1614 if (!HasRet) 1615 return false; 1616 1617 Chain = TCChain; 1618 return true; 1619} 1620 1621EVT 1622X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, 1623 ISD::NodeType ExtendKind) const { 1624 MVT ReturnMVT; 1625 // TODO: Is this also valid on 32-bit? 1626 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1627 ReturnMVT = MVT::i8; 1628 else 1629 ReturnMVT = MVT::i32; 1630 1631 EVT MinVT = getRegisterType(Context, ReturnMVT); 1632 return VT.bitsLT(MinVT) ? MinVT : VT; 1633} 1634 1635/// LowerCallResult - Lower the result values of a call into the 1636/// appropriate copies out of appropriate physical registers. 1637/// 1638SDValue 1639X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1640 CallingConv::ID CallConv, bool isVarArg, 1641 const SmallVectorImpl<ISD::InputArg> &Ins, 1642 DebugLoc dl, SelectionDAG &DAG, 1643 SmallVectorImpl<SDValue> &InVals) const { 1644 1645 // Assign locations to each value returned by this call. 1646 SmallVector<CCValAssign, 16> RVLocs; 1647 bool Is64Bit = Subtarget->is64Bit(); 1648 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1649 getTargetMachine(), RVLocs, *DAG.getContext()); 1650 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1651 1652 // Copy all of the result registers out of their specified physreg. 
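  // Roughly two shapes come out of this loop (descriptive sketch; the exact
  // registers depend on RetCC_X86):
  //  * the ordinary case is a plain CopyFromReg of VA.getLocReg(), e.g. an
  //    i32 result in EAX or a float result in XMM0 when SSE is available;
  //  * results assigned to ST0/ST1 must be popped off the x87 stack, so they
  //    are read with the FpPOP_RETVAL pseudo (as f80 when the value is really
  //    wanted in an SSE register, followed by an FP_ROUND back to f32/f64).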
1653 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1654 CCValAssign &VA = RVLocs[i]; 1655 EVT CopyVT = VA.getValVT(); 1656 1657 // If this is x86-64, and we disabled SSE, we can't return FP values 1658 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1659 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1660 report_fatal_error("SSE register return with SSE disabled"); 1661 } 1662 1663 SDValue Val; 1664 1665 // If this is a call to a function that returns an fp value on the floating 1666 // point stack, we must guarantee the value is popped from the stack, so 1667 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1668 // if the return value is not used. We use the FpPOP_RETVAL instruction 1669 // instead. 1670 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1671 // If we prefer to use the value in xmm registers, copy it out as f80 and 1672 // use a truncate to move it from fp stack reg to xmm reg. 1673 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1674 SDValue Ops[] = { Chain, InFlag }; 1675 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, 1676 MVT::Other, MVT::Glue, Ops, 2), 1); 1677 Val = Chain.getValue(0); 1678 1679 // Round the f80 to the right size, which also moves it to the appropriate 1680 // xmm register. 1681 if (CopyVT != VA.getValVT()) 1682 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1683 // This truncation won't change the value. 1684 DAG.getIntPtrConstant(1)); 1685 } else { 1686 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1687 CopyVT, InFlag).getValue(1); 1688 Val = Chain.getValue(0); 1689 } 1690 InFlag = Chain.getValue(2); 1691 InVals.push_back(Val); 1692 } 1693 1694 return Chain; 1695} 1696 1697 1698//===----------------------------------------------------------------------===// 1699// C & StdCall & Fast Calling Convention implementation 1700//===----------------------------------------------------------------------===// 1701// StdCall calling convention seems to be standard for many Windows' API 1702// routines and around. It differs from C calling convention just a little: 1703// callee should clean up the stack, not caller. Symbols should be also 1704// decorated in some fancy way :) It doesn't support any vector arguments. 1705// For info on fast calling convention see Fast Calling Convention (tail call) 1706// implementation LowerX86_32FastCCCallTo. 1707 1708/// CallIsStructReturn - Determines whether a call uses struct return 1709/// semantics. 1710enum StructReturnType { 1711 NotStructReturn, 1712 RegStructReturn, 1713 StackStructReturn 1714}; 1715static StructReturnType 1716callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1717 if (Outs.empty()) 1718 return NotStructReturn; 1719 1720 const ISD::ArgFlagsTy &Flags = Outs[0].Flags; 1721 if (!Flags.isSRet()) 1722 return NotStructReturn; 1723 if (Flags.isInReg()) 1724 return RegStructReturn; 1725 return StackStructReturn; 1726} 1727 1728/// ArgsAreStructReturn - Determines whether a function uses struct 1729/// return semantics. 
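/// For example (sketch of the mapping implemented below):
///   declare void @f(%struct.S* sret %out)        --> StackStructReturn
///   declare void @g(%struct.S* inreg sret %out)  --> RegStructReturn
///   declare i32  @h(i32 %x)                      --> NotStructReturn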
1730static StructReturnType 1731argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1732 if (Ins.empty()) 1733 return NotStructReturn; 1734 1735 const ISD::ArgFlagsTy &Flags = Ins[0].Flags; 1736 if (!Flags.isSRet()) 1737 return NotStructReturn; 1738 if (Flags.isInReg()) 1739 return RegStructReturn; 1740 return StackStructReturn; 1741} 1742 1743/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1744/// by "Src" to address "Dst" with size and alignment information specified by 1745/// the specific parameter attribute. The copy will be passed as a byval 1746/// function parameter. 1747static SDValue 1748CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1749 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1750 DebugLoc dl) { 1751 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1752 1753 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1754 /*isVolatile*/false, /*AlwaysInline=*/true, 1755 MachinePointerInfo(), MachinePointerInfo()); 1756} 1757 1758/// IsTailCallConvention - Return true if the calling convention is one that 1759/// supports tail call optimization. 1760static bool IsTailCallConvention(CallingConv::ID CC) { 1761 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1762} 1763 1764bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 1765 if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) 1766 return false; 1767 1768 CallSite CS(CI); 1769 CallingConv::ID CalleeCC = CS.getCallingConv(); 1770 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 1771 return false; 1772 1773 return true; 1774} 1775 1776/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1777/// a tailcall target by changing its ABI. 1778static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, 1779 bool GuaranteedTailCallOpt) { 1780 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1781} 1782 1783SDValue 1784X86TargetLowering::LowerMemArgument(SDValue Chain, 1785 CallingConv::ID CallConv, 1786 const SmallVectorImpl<ISD::InputArg> &Ins, 1787 DebugLoc dl, SelectionDAG &DAG, 1788 const CCValAssign &VA, 1789 MachineFrameInfo *MFI, 1790 unsigned i) const { 1791 // Create the nodes corresponding to a load from this parameter slot. 1792 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1793 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv, 1794 getTargetMachine().Options.GuaranteedTailCallOpt); 1795 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1796 EVT ValVT; 1797 1798 // If value is passed by pointer we have address passed instead of the value 1799 // itself. 1800 if (VA.getLocInfo() == CCValAssign::Indirect) 1801 ValVT = VA.getLocVT(); 1802 else 1803 ValVT = VA.getValVT(); 1804 1805 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1806 // changed with more analysis. 1807 // In case of tail call optimization mark all arguments mutable. Since they 1808 // could be overwritten by lowering of arguments in case of a tail call. 1809 if (Flags.isByVal()) { 1810 unsigned Bytes = Flags.getByValSize(); 1811 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 
1812 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 1813 return DAG.getFrameIndex(FI, getPointerTy()); 1814 } else { 1815 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1816 VA.getLocMemOffset(), isImmutable); 1817 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1818 return DAG.getLoad(ValVT, dl, Chain, FIN, 1819 MachinePointerInfo::getFixedStack(FI), 1820 false, false, false, 0); 1821 } 1822} 1823 1824SDValue 1825X86TargetLowering::LowerFormalArguments(SDValue Chain, 1826 CallingConv::ID CallConv, 1827 bool isVarArg, 1828 const SmallVectorImpl<ISD::InputArg> &Ins, 1829 DebugLoc dl, 1830 SelectionDAG &DAG, 1831 SmallVectorImpl<SDValue> &InVals) 1832 const { 1833 MachineFunction &MF = DAG.getMachineFunction(); 1834 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1835 1836 const Function* Fn = MF.getFunction(); 1837 if (Fn->hasExternalLinkage() && 1838 Subtarget->isTargetCygMing() && 1839 Fn->getName() == "main") 1840 FuncInfo->setForceFramePointer(true); 1841 1842 MachineFrameInfo *MFI = MF.getFrameInfo(); 1843 bool Is64Bit = Subtarget->is64Bit(); 1844 bool IsWindows = Subtarget->isTargetWindows(); 1845 bool IsWin64 = Subtarget->isTargetWin64(); 1846 1847 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1848 "Var args not supported with calling convention fastcc or ghc"); 1849 1850 // Assign locations to all of the incoming arguments. 1851 SmallVector<CCValAssign, 16> ArgLocs; 1852 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1853 ArgLocs, *DAG.getContext()); 1854 1855 // Allocate shadow area for Win64 1856 if (IsWin64) { 1857 CCInfo.AllocateStack(32, 8); 1858 } 1859 1860 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1861 1862 unsigned LastVal = ~0U; 1863 SDValue ArgValue; 1864 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1865 CCValAssign &VA = ArgLocs[i]; 1866 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1867 // places. 1868 assert(VA.getValNo() != LastVal && 1869 "Don't support value assigned to multiple locs yet"); 1870 (void)LastVal; 1871 LastVal = VA.getValNo(); 1872 1873 if (VA.isRegLoc()) { 1874 EVT RegVT = VA.getLocVT(); 1875 const TargetRegisterClass *RC; 1876 if (RegVT == MVT::i32) 1877 RC = &X86::GR32RegClass; 1878 else if (Is64Bit && RegVT == MVT::i64) 1879 RC = &X86::GR64RegClass; 1880 else if (RegVT == MVT::f32) 1881 RC = &X86::FR32RegClass; 1882 else if (RegVT == MVT::f64) 1883 RC = &X86::FR64RegClass; 1884 else if (RegVT.is256BitVector()) 1885 RC = &X86::VR256RegClass; 1886 else if (RegVT.is128BitVector()) 1887 RC = &X86::VR128RegClass; 1888 else if (RegVT == MVT::x86mmx) 1889 RC = &X86::VR64RegClass; 1890 else 1891 llvm_unreachable("Unknown argument type!"); 1892 1893 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1894 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1895 1896 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1897 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1898 // right size. 
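      // A minimal sketch of what this builds for, say, an i8 argument passed
      // signext in a 32-bit register (RegVT == MVT::i32):
      //
      //   SDValue V = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32);
      //   V = DAG.getNode(ISD::AssertSext, dl, MVT::i32, V,
      //                   DAG.getValueType(MVT::i8));
      //   V = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, V);
      //
      // so later combines know the upper 24 bits are sign copies of bit 7.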
1899 if (VA.getLocInfo() == CCValAssign::SExt) 1900 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1901 DAG.getValueType(VA.getValVT())); 1902 else if (VA.getLocInfo() == CCValAssign::ZExt) 1903 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1904 DAG.getValueType(VA.getValVT())); 1905 else if (VA.getLocInfo() == CCValAssign::BCvt) 1906 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1907 1908 if (VA.isExtInLoc()) { 1909 // Handle MMX values passed in XMM regs. 1910 if (RegVT.isVector()) { 1911 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1912 ArgValue); 1913 } else 1914 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1915 } 1916 } else { 1917 assert(VA.isMemLoc()); 1918 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1919 } 1920 1921 // If value is passed via pointer - do a load. 1922 if (VA.getLocInfo() == CCValAssign::Indirect) 1923 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1924 MachinePointerInfo(), false, false, false, 0); 1925 1926 InVals.push_back(ArgValue); 1927 } 1928 1929 // The x86-64 ABI for returning structs by value requires that we copy 1930 // the sret argument into %rax for the return. Save the argument into 1931 // a virtual register so that we can access it from the return points. 1932 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1933 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1934 unsigned Reg = FuncInfo->getSRetReturnReg(); 1935 if (!Reg) { 1936 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1937 FuncInfo->setSRetReturnReg(Reg); 1938 } 1939 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1940 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1941 } 1942 1943 unsigned StackSize = CCInfo.getNextStackOffset(); 1944 // Align stack specially for tail calls. 1945 if (FuncIsMadeTailCallSafe(CallConv, 1946 MF.getTarget().Options.GuaranteedTailCallOpt)) 1947 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1948 1949 // If the function takes variable number of arguments, make a frame index for 1950 // the start of the first vararg value... for expansion of llvm.va_start. 1951 if (isVarArg) { 1952 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1953 CallConv != CallingConv::X86_ThisCall)) { 1954 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1955 } 1956 if (Is64Bit) { 1957 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1958 1959 // FIXME: We should really autogenerate these arrays 1960 static const uint16_t GPR64ArgRegsWin64[] = { 1961 X86::RCX, X86::RDX, X86::R8, X86::R9 1962 }; 1963 static const uint16_t GPR64ArgRegs64Bit[] = { 1964 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1965 }; 1966 static const uint16_t XMMArgRegs64Bit[] = { 1967 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1968 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1969 }; 1970 const uint16_t *GPR64ArgRegs; 1971 unsigned NumXMMRegs = 0; 1972 1973 if (IsWin64) { 1974 // The XMM registers which might contain var arg parameters are shadowed 1975 // in their paired GPR. So we only need to save the GPR to their home 1976 // slots. 
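        // Sketch of the Win64 frame this relies on: the caller always
        // reserves a 32-byte home area directly above the return address, so
        // on entry
        //   [rsp +  8]  home slot for RCX (1st argument)
        //   [rsp + 16]  home slot for RDX (2nd argument)
        //   [rsp + 24]  home slot for R8  (3rd argument)
        //   [rsp + 32]  home slot for R9  (4th argument)
        // and spilling just those four GPRs is enough for va_arg to work.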
1977 TotalNumIntRegs = 4; 1978 GPR64ArgRegs = GPR64ArgRegsWin64; 1979 } else { 1980 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1981 GPR64ArgRegs = GPR64ArgRegs64Bit; 1982 1983 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, 1984 TotalNumXMMRegs); 1985 } 1986 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1987 TotalNumIntRegs); 1988 1989 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1990 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1991 "SSE register cannot be used when SSE is disabled!"); 1992 assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && 1993 NoImplicitFloatOps) && 1994 "SSE register cannot be used when SSE is disabled!"); 1995 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || 1996 !Subtarget->hasSSE1()) 1997 // Kernel mode asks for SSE to be disabled, so don't push them 1998 // on the stack. 1999 TotalNumXMMRegs = 0; 2000 2001 if (IsWin64) { 2002 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); 2003 // Get to the caller-allocated home save location. Add 8 to account 2004 // for the return address. 2005 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 2006 FuncInfo->setRegSaveFrameIndex( 2007 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 2008 // Fixup to set vararg frame on shadow area (4 x i64). 2009 if (NumIntRegs < 4) 2010 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 2011 } else { 2012 // For X86-64, if there are vararg parameters that are passed via 2013 // registers, then we must store them to their spots on the stack so 2014 // they may be loaded by deferencing the result of va_next. 2015 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 2016 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 2017 FuncInfo->setRegSaveFrameIndex( 2018 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 2019 false)); 2020 } 2021 2022 // Store the integer parameter registers. 2023 SmallVector<SDValue, 8> MemOps; 2024 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 2025 getPointerTy()); 2026 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 2027 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 2028 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 2029 DAG.getIntPtrConstant(Offset)); 2030 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 2031 &X86::GR64RegClass); 2032 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 2033 SDValue Store = 2034 DAG.getStore(Val.getValue(1), dl, Val, FIN, 2035 MachinePointerInfo::getFixedStack( 2036 FuncInfo->getRegSaveFrameIndex(), Offset), 2037 false, false, 0); 2038 MemOps.push_back(Store); 2039 Offset += 8; 2040 } 2041 2042 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 2043 // Now store the XMM (fp + vector) parameter registers. 
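        // Operand layout of the X86ISD::VASTART_SAVE_XMM_REGS node built
        // below (descriptive sketch):
        //   0:  the chain
        //   1:  AL, the dynamic count of vector registers the caller used
        //   2:  RegSaveFrameIndex, the start of the register save area
        //   3:  VarArgsFPOffset, where the first saved XMM value goes (the
        //       six GPR slots occupy the first 48 bytes of the 176-byte
        //       SysV save area)
        //   4+: the live-in XMM values to store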
2044 SmallVector<SDValue, 11> SaveXMMOps; 2045 SaveXMMOps.push_back(Chain); 2046 2047 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 2048 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 2049 SaveXMMOps.push_back(ALVal); 2050 2051 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2052 FuncInfo->getRegSaveFrameIndex())); 2053 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2054 FuncInfo->getVarArgsFPOffset())); 2055 2056 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 2057 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 2058 &X86::VR128RegClass); 2059 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 2060 SaveXMMOps.push_back(Val); 2061 } 2062 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 2063 MVT::Other, 2064 &SaveXMMOps[0], SaveXMMOps.size())); 2065 } 2066 2067 if (!MemOps.empty()) 2068 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2069 &MemOps[0], MemOps.size()); 2070 } 2071 } 2072 2073 // Some CCs need callee pop. 2074 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2075 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2076 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 2077 } else { 2078 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 2079 // If this is an sret function, the return should pop the hidden pointer. 2080 if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2081 argsAreStructReturn(Ins) == StackStructReturn) 2082 FuncInfo->setBytesToPopOnReturn(4); 2083 } 2084 2085 if (!Is64Bit) { 2086 // RegSaveFrameIndex is X86-64 only. 2087 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 2088 if (CallConv == CallingConv::X86_FastCall || 2089 CallConv == CallingConv::X86_ThisCall) 2090 // fastcc functions can't have varargs. 2091 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 2092 } 2093 2094 FuncInfo->setArgumentStackSize(StackSize); 2095 2096 return Chain; 2097} 2098 2099SDValue 2100X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 2101 SDValue StackPtr, SDValue Arg, 2102 DebugLoc dl, SelectionDAG &DAG, 2103 const CCValAssign &VA, 2104 ISD::ArgFlagsTy Flags) const { 2105 unsigned LocMemOffset = VA.getLocMemOffset(); 2106 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 2107 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 2108 if (Flags.isByVal()) 2109 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 2110 2111 return DAG.getStore(Chain, dl, Arg, PtrOff, 2112 MachinePointerInfo::getStack(LocMemOffset), 2113 false, false, 0); 2114} 2115 2116/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 2117/// optimization is performed and it is required. 2118SDValue 2119X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 2120 SDValue &OutRetAddr, SDValue Chain, 2121 bool IsTailCall, bool Is64Bit, 2122 int FPDiff, DebugLoc dl) const { 2123 // Adjust the Return address stack slot. 2124 EVT VT = getPointerTy(); 2125 OutRetAddr = getReturnAddressFrameIndex(DAG); 2126 2127 // Load the "old" Return address. 2128 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 2129 false, false, false, 0); 2130 return SDValue(OutRetAddr.getNode(), 1); 2131} 2132 2133/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 2134/// optimization is performed and it is required (FPDiff!=0). 
2135static SDValue 2136EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 2137 SDValue Chain, SDValue RetAddrFrIdx, 2138 bool Is64Bit, int FPDiff, DebugLoc dl) { 2139 // Store the return address to the appropriate stack slot. 2140 if (!FPDiff) return Chain; 2141 // Calculate the new stack slot for the return address. 2142 int SlotSize = Is64Bit ? 8 : 4; 2143 int NewReturnAddrFI = 2144 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 2145 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 2146 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 2147 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 2148 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 2149 false, false, 0); 2150 return Chain; 2151} 2152 2153SDValue 2154X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2155 SmallVectorImpl<SDValue> &InVals) const { 2156 SelectionDAG &DAG = CLI.DAG; 2157 DebugLoc &dl = CLI.DL; 2158 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 2159 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 2160 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 2161 SDValue Chain = CLI.Chain; 2162 SDValue Callee = CLI.Callee; 2163 CallingConv::ID CallConv = CLI.CallConv; 2164 bool &isTailCall = CLI.IsTailCall; 2165 bool isVarArg = CLI.IsVarArg; 2166 2167 MachineFunction &MF = DAG.getMachineFunction(); 2168 bool Is64Bit = Subtarget->is64Bit(); 2169 bool IsWin64 = Subtarget->isTargetWin64(); 2170 bool IsWindows = Subtarget->isTargetWindows(); 2171 StructReturnType SR = callIsStructReturn(Outs); 2172 bool IsSibcall = false; 2173 2174 if (MF.getTarget().Options.DisableTailCalls) 2175 isTailCall = false; 2176 2177 if (isTailCall) { 2178 // Check if it's really possible to do a tail call. 2179 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 2180 isVarArg, SR != NotStructReturn, 2181 MF.getFunction()->hasStructRetAttr(), 2182 Outs, OutVals, Ins, DAG); 2183 2184 // Sibcalls are automatically detected tailcalls which do not require 2185 // ABI changes. 2186 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) 2187 IsSibcall = true; 2188 2189 if (isTailCall) 2190 ++NumTailCalls; 2191 } 2192 2193 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 2194 "Var args not supported with calling convention fastcc or ghc"); 2195 2196 // Analyze operands of the call, assigning locations to each operand. 2197 SmallVector<CCValAssign, 16> ArgLocs; 2198 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2199 ArgLocs, *DAG.getContext()); 2200 2201 // Allocate shadow area for Win64 2202 if (IsWin64) { 2203 CCInfo.AllocateStack(32, 8); 2204 } 2205 2206 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2207 2208 // Get a count of how many bytes are to be pushed on the stack. 2209 unsigned NumBytes = CCInfo.getNextStackOffset(); 2210 if (IsSibcall) 2211 // This is a sibcall. The memory operands are available in caller's 2212 // own caller's stack. 2213 NumBytes = 0; 2214 else if (getTargetMachine().Options.GuaranteedTailCallOpt && 2215 IsTailCallConvention(CallConv)) 2216 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 2217 2218 int FPDiff = 0; 2219 if (isTailCall && !IsSibcall) { 2220 // Lower arguments at fp - stackoffset + fpdiff. 2221 unsigned NumBytesCallerPushed = 2222 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 2223 FPDiff = NumBytesCallerPushed - NumBytes; 2224 2225 // Set the delta of movement of the returnaddr stackslot. 2226 // But only set if delta is greater than previous delta. 
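    // FPDiff is negative exactly when the callee needs more argument space
    // than the caller's incoming area provides. A sketch with made-up sizes:
    // the caller pops 16 bytes of its own incoming arguments and the callee
    // needs 32, so FPDiff = 16 - 32 = -16 and the return address has to be
    // re-stored 16 bytes further down (see EmitTailCallStoreRetAddr above).
    // A more negative FPDiff means a bigger move, which is why the most
    // demanding (smallest) delta is the one recorded below.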
2227 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 2228 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 2229 } 2230 2231 if (!IsSibcall) 2232 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 2233 2234 SDValue RetAddrFrIdx; 2235 // Load return address for tail calls. 2236 if (isTailCall && FPDiff) 2237 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 2238 Is64Bit, FPDiff, dl); 2239 2240 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2241 SmallVector<SDValue, 8> MemOpChains; 2242 SDValue StackPtr; 2243 2244 // Walk the register/memloc assignments, inserting copies/loads. In the case 2245 // of tail call optimization arguments are handle later. 2246 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2247 CCValAssign &VA = ArgLocs[i]; 2248 EVT RegVT = VA.getLocVT(); 2249 SDValue Arg = OutVals[i]; 2250 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2251 bool isByVal = Flags.isByVal(); 2252 2253 // Promote the value if needed. 2254 switch (VA.getLocInfo()) { 2255 default: llvm_unreachable("Unknown loc info!"); 2256 case CCValAssign::Full: break; 2257 case CCValAssign::SExt: 2258 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 2259 break; 2260 case CCValAssign::ZExt: 2261 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 2262 break; 2263 case CCValAssign::AExt: 2264 if (RegVT.is128BitVector()) { 2265 // Special case: passing MMX values in XMM registers. 2266 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 2267 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 2268 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 2269 } else 2270 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 2271 break; 2272 case CCValAssign::BCvt: 2273 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 2274 break; 2275 case CCValAssign::Indirect: { 2276 // Store the argument. 2277 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2278 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2279 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2280 MachinePointerInfo::getFixedStack(FI), 2281 false, false, 0); 2282 Arg = SpillSlot; 2283 break; 2284 } 2285 } 2286 2287 if (VA.isRegLoc()) { 2288 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2289 if (isVarArg && IsWin64) { 2290 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2291 // shadow reg if callee is a varargs function. 2292 unsigned ShadowReg = 0; 2293 switch (VA.getLocReg()) { 2294 case X86::XMM0: ShadowReg = X86::RCX; break; 2295 case X86::XMM1: ShadowReg = X86::RDX; break; 2296 case X86::XMM2: ShadowReg = X86::R8; break; 2297 case X86::XMM3: ShadowReg = X86::R9; break; 2298 } 2299 if (ShadowReg) 2300 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 2301 } 2302 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2303 assert(VA.isMemLoc()); 2304 if (StackPtr.getNode() == 0) 2305 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2306 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2307 dl, DAG, VA, Flags)); 2308 } 2309 } 2310 2311 if (!MemOpChains.empty()) 2312 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2313 &MemOpChains[0], MemOpChains.size()); 2314 2315 if (Subtarget->isPICStyleGOT()) { 2316 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2317 // GOT pointer. 
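  // Descriptive sketch of the two cases below: for an ordinary call the
  // PIC/GOT base is copied into EBX, where a call through the PLT expects to
  // find it; for a tail call EBX is no use (it is callee-saved and restored
  // before the jump), so the callee address is resolved here and later moved
  // into ECX for an indirect tail jump.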
2318 if (!isTailCall) { 2319 RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), 2320 DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()))); 2321 } else { 2322 // If we are tail calling and generating PIC/GOT style code load the 2323 // address of the callee into ECX. The value in ecx is used as target of 2324 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2325 // for tail calls on PIC/GOT architectures. Normally we would just put the 2326 // address of GOT into ebx and then call target@PLT. But for tail calls 2327 // ebx would be restored (since ebx is callee saved) before jumping to the 2328 // target@PLT. 2329 2330 // Note: The actual moving to ECX is done further down. 2331 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2332 if (G && !G->getGlobal()->hasHiddenVisibility() && 2333 !G->getGlobal()->hasProtectedVisibility()) 2334 Callee = LowerGlobalAddress(Callee, DAG); 2335 else if (isa<ExternalSymbolSDNode>(Callee)) 2336 Callee = LowerExternalSymbol(Callee, DAG); 2337 } 2338 } 2339 2340 if (Is64Bit && isVarArg && !IsWin64) { 2341 // From AMD64 ABI document: 2342 // For calls that may call functions that use varargs or stdargs 2343 // (prototype-less calls or calls to functions containing ellipsis (...) in 2344 // the declaration) %al is used as hidden argument to specify the number 2345 // of SSE registers used. The contents of %al do not need to match exactly 2346 // the number of registers, but must be an ubound on the number of SSE 2347 // registers used and is in the range 0 - 8 inclusive. 2348 2349 // Count the number of XMM registers allocated. 2350 static const uint16_t XMMArgRegs[] = { 2351 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2352 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2353 }; 2354 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2355 assert((Subtarget->hasSSE1() || !NumXMMRegs) 2356 && "SSE registers cannot be used when SSE is disabled"); 2357 2358 RegsToPass.push_back(std::make_pair(unsigned(X86::AL), 2359 DAG.getConstant(NumXMMRegs, MVT::i8))); 2360 } 2361 2362 // For tail calls lower the arguments to the 'real' stack slot. 2363 if (isTailCall) { 2364 // Force all the incoming stack arguments to be loaded from the stack 2365 // before any new outgoing arguments are stored to the stack, because the 2366 // outgoing stack slots may alias the incoming argument stack slots, and 2367 // the alias isn't otherwise explicit. This is slightly more conservative 2368 // than necessary, because it means that each store effectively depends 2369 // on every argument instead of just those arguments it would clobber. 2370 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2371 2372 SmallVector<SDValue, 8> MemOpChains2; 2373 SDValue FIN; 2374 int FI = 0; 2375 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2376 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2377 CCValAssign &VA = ArgLocs[i]; 2378 if (VA.isRegLoc()) 2379 continue; 2380 assert(VA.isMemLoc()); 2381 SDValue Arg = OutVals[i]; 2382 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2383 // Create frame index. 2384 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2385 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2386 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2387 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2388 2389 if (Flags.isByVal()) { 2390 // Copy relative to framepointer. 
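          // Descriptive sketch: 'Source' below is where the first pass above
          // placed this byval argument in the normal outgoing area (stack
          // pointer + LocMemOffset), 'FIN' is its final FPDiff-adjusted slot,
          // and CreateCopyOfByValArgument emits the memcpy between the two
          // chained on ArgChain so all incoming-argument loads happen first.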
2391 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2392 if (StackPtr.getNode() == 0) 2393 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2394 getPointerTy()); 2395 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2396 2397 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2398 ArgChain, 2399 Flags, DAG, dl)); 2400 } else { 2401 // Store relative to framepointer. 2402 MemOpChains2.push_back( 2403 DAG.getStore(ArgChain, dl, Arg, FIN, 2404 MachinePointerInfo::getFixedStack(FI), 2405 false, false, 0)); 2406 } 2407 } 2408 } 2409 2410 if (!MemOpChains2.empty()) 2411 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2412 &MemOpChains2[0], MemOpChains2.size()); 2413 2414 // Store the return address to the appropriate stack slot. 2415 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2416 FPDiff, dl); 2417 } 2418 2419 // Build a sequence of copy-to-reg nodes chained together with token chain 2420 // and flag operands which copy the outgoing args into registers. 2421 SDValue InFlag; 2422 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2423 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2424 RegsToPass[i].second, InFlag); 2425 InFlag = Chain.getValue(1); 2426 } 2427 2428 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2429 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2430 // In the 64-bit large code model, we have to make all calls 2431 // through a register, since the call instruction's 32-bit 2432 // pc-relative offset may not be large enough to hold the whole 2433 // address. 2434 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2435 // If the callee is a GlobalAddress node (quite common, every direct call 2436 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2437 // it. 2438 2439 // We should use extra load for direct calls to dllimported functions in 2440 // non-JIT mode. 2441 const GlobalValue *GV = G->getGlobal(); 2442 if (!GV->hasDLLImportLinkage()) { 2443 unsigned char OpFlags = 0; 2444 bool ExtraLoad = false; 2445 unsigned WrapperKind = ISD::DELETED_NODE; 2446 2447 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2448 // external symbols most go through the PLT in PIC mode. If the symbol 2449 // has hidden or protected visibility, or if it is static or local, then 2450 // we don't need to use the PLT - we can directly call it. 2451 if (Subtarget->isTargetELF() && 2452 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2453 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2454 OpFlags = X86II::MO_PLT; 2455 } else if (Subtarget->isPICStyleStubAny() && 2456 (GV->isDeclaration() || GV->isWeakForLinker()) && 2457 (!Subtarget->getTargetTriple().isMacOSX() || 2458 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2459 // PC-relative references to external symbols should go through $stub, 2460 // unless we're building with the leopard linker or later, which 2461 // automatically synthesizes these stubs. 2462 OpFlags = X86II::MO_DARWIN_STUB; 2463 } else if (Subtarget->isPICStyleRIPRel() && 2464 isa<Function>(GV) && 2465 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { 2466 // If the function is marked as non-lazy, generate an indirect call 2467 // which loads from the GOT directly. This avoids runtime overhead 2468 // at the cost of eager binding (and one extra byte of encoding). 
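        // e.g. (sketch)   declare void @g() nonlazybind
        //                 call void @g()
        // becomes an indirect call through g's GOT slot, roughly
        //   callq *g@GOTPCREL(%rip)
        // rather than a direct call that the dynamic linker binds lazily.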
2469 OpFlags = X86II::MO_GOTPCREL; 2470 WrapperKind = X86ISD::WrapperRIP; 2471 ExtraLoad = true; 2472 } 2473 2474 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2475 G->getOffset(), OpFlags); 2476 2477 // Add a wrapper if needed. 2478 if (WrapperKind != ISD::DELETED_NODE) 2479 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2480 // Add extra indirection if needed. 2481 if (ExtraLoad) 2482 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2483 MachinePointerInfo::getGOT(), 2484 false, false, false, 0); 2485 } 2486 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2487 unsigned char OpFlags = 0; 2488 2489 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2490 // external symbols should go through the PLT. 2491 if (Subtarget->isTargetELF() && 2492 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2493 OpFlags = X86II::MO_PLT; 2494 } else if (Subtarget->isPICStyleStubAny() && 2495 (!Subtarget->getTargetTriple().isMacOSX() || 2496 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2497 // PC-relative references to external symbols should go through $stub, 2498 // unless we're building with the leopard linker or later, which 2499 // automatically synthesizes these stubs. 2500 OpFlags = X86II::MO_DARWIN_STUB; 2501 } 2502 2503 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2504 OpFlags); 2505 } 2506 2507 // Returns a chain & a flag for retval copy to use. 2508 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2509 SmallVector<SDValue, 8> Ops; 2510 2511 if (!IsSibcall && isTailCall) { 2512 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2513 DAG.getIntPtrConstant(0, true), InFlag); 2514 InFlag = Chain.getValue(1); 2515 } 2516 2517 Ops.push_back(Chain); 2518 Ops.push_back(Callee); 2519 2520 if (isTailCall) 2521 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2522 2523 // Add argument registers to the end of the list so that they are known live 2524 // into the call. 2525 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2526 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2527 RegsToPass[i].second.getValueType())); 2528 2529 // Add a register mask operand representing the call-preserved registers. 2530 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 2531 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); 2532 assert(Mask && "Missing call preserved mask for calling convention"); 2533 Ops.push_back(DAG.getRegisterMask(Mask)); 2534 2535 if (InFlag.getNode()) 2536 Ops.push_back(InFlag); 2537 2538 if (isTailCall) { 2539 // We used to do: 2540 //// If this is the first return lowered for this function, add the regs 2541 //// to the liveout set for the function. 2542 // This isn't right, although it's probably harmless on x86; liveouts 2543 // should be computed from returns not tail calls. Consider a void 2544 // function making a tail call to a function returning int. 2545 return DAG.getNode(X86ISD::TC_RETURN, dl, 2546 NodeTys, &Ops[0], Ops.size()); 2547 } 2548 2549 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2550 InFlag = Chain.getValue(1); 2551 2552 // Create the CALLSEQ_END node. 
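  // A few concrete shapes of the callee-pop computation below (sketch):
  //  * a 32-bit stdcall/fastcall callee taking 12 bytes of arguments pops
  //    all 12 (NumBytesForCalleeToPush == NumBytes);
  //  * a 32-bit cdecl sret call on Linux/Darwin/MinGW pops just the 4-byte
  //    hidden struct pointer;
  //  * a plain cdecl call, or a typical 64-bit call, pops nothing and the
  //    CALLSEQ_END below releases the whole area.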
2553 unsigned NumBytesForCalleeToPush; 2554 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2555 getTargetMachine().Options.GuaranteedTailCallOpt)) 2556 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2557 else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2558 SR == StackStructReturn) 2559 // If this is a call to a struct-return function, the callee 2560 // pops the hidden struct pointer, so we have to push it back. 2561 // This is common for Darwin/X86, Linux & Mingw32 targets. 2562 // For MSVC Win32 targets, the caller pops the hidden struct pointer. 2563 NumBytesForCalleeToPush = 4; 2564 else 2565 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2566 2567 // Returns a flag for retval copy to use. 2568 if (!IsSibcall) { 2569 Chain = DAG.getCALLSEQ_END(Chain, 2570 DAG.getIntPtrConstant(NumBytes, true), 2571 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2572 true), 2573 InFlag); 2574 InFlag = Chain.getValue(1); 2575 } 2576 2577 // Handle result values, copying them out of physregs into vregs that we 2578 // return. 2579 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2580 Ins, dl, DAG, InVals); 2581} 2582 2583 2584//===----------------------------------------------------------------------===// 2585// Fast Calling Convention (tail call) implementation 2586//===----------------------------------------------------------------------===// 2587 2588// Like std call, callee cleans arguments, convention except that ECX is 2589// reserved for storing the tail called function address. Only 2 registers are 2590// free for argument passing (inreg). Tail call optimization is performed 2591// provided: 2592// * tailcallopt is enabled 2593// * caller/callee are fastcc 2594// On X86_64 architecture with GOT-style position independent code only local 2595// (within module) calls are supported at the moment. 2596// To keep the stack aligned according to platform abi the function 2597// GetAlignedArgumentStackSize ensures that argument delta is always multiples 2598// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) 2599// If a tail called function callee has more arguments than the caller the 2600// caller needs to make sure that there is room to move the RETADDR to. This is 2601// achieved by reserving an area the size of the argument delta right after the 2602// original REtADDR, but before the saved framepointer or the spilled registers 2603// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) 2604// stack layout: 2605// arg1 2606// arg2 2607// RETADDR 2608// [ new RETADDR 2609// move area ] 2610// (possible EBP) 2611// ESI 2612// EDI 2613// local1 .. 2614 2615/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 2616/// for a 16 byte align requirement. 2617unsigned 2618X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2619 SelectionDAG& DAG) const { 2620 MachineFunction &MF = DAG.getMachineFunction(); 2621 const TargetMachine &TM = MF.getTarget(); 2622 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 2623 unsigned StackAlignment = TFI.getStackAlignment(); 2624 uint64_t AlignMask = StackAlignment - 1; 2625 int64_t Offset = StackSize; 2626 uint64_t SlotSize = TD->getPointerSize(); 2627 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2628 // Number smaller than 12 so just add the difference. 2629 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2630 } else { 2631 // Mask out lower bits, add stackalignment once plus the 12 bytes. 
2632 Offset = ((~AlignMask) & Offset) + StackAlignment + 2633 (StackAlignment-SlotSize); 2634 } 2635 return Offset; 2636} 2637 2638/// MatchingStackOffset - Return true if the given stack call argument is 2639/// already available in the same position (relatively) of the caller's 2640/// incoming argument stack. 2641static 2642bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2643 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2644 const X86InstrInfo *TII) { 2645 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2646 int FI = INT_MAX; 2647 if (Arg.getOpcode() == ISD::CopyFromReg) { 2648 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2649 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2650 return false; 2651 MachineInstr *Def = MRI->getVRegDef(VR); 2652 if (!Def) 2653 return false; 2654 if (!Flags.isByVal()) { 2655 if (!TII->isLoadFromStackSlot(Def, FI)) 2656 return false; 2657 } else { 2658 unsigned Opcode = Def->getOpcode(); 2659 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2660 Def->getOperand(1).isFI()) { 2661 FI = Def->getOperand(1).getIndex(); 2662 Bytes = Flags.getByValSize(); 2663 } else 2664 return false; 2665 } 2666 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2667 if (Flags.isByVal()) 2668 // ByVal argument is passed in as a pointer but it's now being 2669 // dereferenced. e.g. 2670 // define @foo(%struct.X* %A) { 2671 // tail call @bar(%struct.X* byval %A) 2672 // } 2673 return false; 2674 SDValue Ptr = Ld->getBasePtr(); 2675 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2676 if (!FINode) 2677 return false; 2678 FI = FINode->getIndex(); 2679 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2680 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2681 FI = FINode->getIndex(); 2682 Bytes = Flags.getByValSize(); 2683 } else 2684 return false; 2685 2686 assert(FI != INT_MAX); 2687 if (!MFI->isFixedObjectIndex(FI)) 2688 return false; 2689 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2690} 2691 2692/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2693/// for tail call optimization. Targets which want to do tail call 2694/// optimization should implement this function. 2695bool 2696X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2697 CallingConv::ID CalleeCC, 2698 bool isVarArg, 2699 bool isCalleeStructRet, 2700 bool isCallerStructRet, 2701 const SmallVectorImpl<ISD::OutputArg> &Outs, 2702 const SmallVectorImpl<SDValue> &OutVals, 2703 const SmallVectorImpl<ISD::InputArg> &Ins, 2704 SelectionDAG& DAG) const { 2705 if (!IsTailCallConvention(CalleeCC) && 2706 CalleeCC != CallingConv::C) 2707 return false; 2708 2709 // If -tailcallopt is specified, make fastcc functions tail-callable. 2710 const MachineFunction &MF = DAG.getMachineFunction(); 2711 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2712 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2713 bool CCMatch = CallerCC == CalleeCC; 2714 2715 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2716 if (IsTailCallConvention(CalleeCC) && CCMatch) 2717 return true; 2718 return false; 2719 } 2720 2721 // Look for obvious safe cases to perform tail call optimization that do not 2722 // require ABI changes. This is what gcc calls sibcall. 2723 2724 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2725 // emit a special epilogue. 
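  // (A typical trigger, as a sketch: the caller keeps a local that needs
  // 32-byte alignment while the ABI only guarantees 16, so PEI must emit the
  // realigning prologue/epilogue, and the sibcall transformation is
  // conservatively disabled for such callers.)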
2726 if (RegInfo->needsStackRealignment(MF)) 2727 return false; 2728 2729 // Also avoid sibcall optimization if either caller or callee uses struct 2730 // return semantics. 2731 if (isCalleeStructRet || isCallerStructRet) 2732 return false; 2733 2734 // An stdcall caller is expected to clean up its arguments; the callee 2735 // isn't going to do that. 2736 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2737 return false; 2738 2739 // Do not sibcall optimize vararg calls unless all arguments are passed via 2740 // registers. 2741 if (isVarArg && !Outs.empty()) { 2742 2743 // Optimizing for varargs on Win64 is unlikely to be safe without 2744 // additional testing. 2745 if (Subtarget->isTargetWin64()) 2746 return false; 2747 2748 SmallVector<CCValAssign, 16> ArgLocs; 2749 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2750 getTargetMachine(), ArgLocs, *DAG.getContext()); 2751 2752 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2753 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2754 if (!ArgLocs[i].isRegLoc()) 2755 return false; 2756 } 2757 2758 // If the call result is in ST0 / ST1, it needs to be popped off the x87 2759 // stack. Therefore, if it's not used by the call it is not safe to optimize 2760 // this into a sibcall. 2761 bool Unused = false; 2762 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2763 if (!Ins[i].Used) { 2764 Unused = true; 2765 break; 2766 } 2767 } 2768 if (Unused) { 2769 SmallVector<CCValAssign, 16> RVLocs; 2770 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 2771 getTargetMachine(), RVLocs, *DAG.getContext()); 2772 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2773 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2774 CCValAssign &VA = RVLocs[i]; 2775 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2776 return false; 2777 } 2778 } 2779 2780 // If the calling conventions do not match, then we'd better make sure the 2781 // results are returned in the same way as what the caller expects. 2782 if (!CCMatch) { 2783 SmallVector<CCValAssign, 16> RVLocs1; 2784 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 2785 getTargetMachine(), RVLocs1, *DAG.getContext()); 2786 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2787 2788 SmallVector<CCValAssign, 16> RVLocs2; 2789 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 2790 getTargetMachine(), RVLocs2, *DAG.getContext()); 2791 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2792 2793 if (RVLocs1.size() != RVLocs2.size()) 2794 return false; 2795 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2796 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2797 return false; 2798 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2799 return false; 2800 if (RVLocs1[i].isRegLoc()) { 2801 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2802 return false; 2803 } else { 2804 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2805 return false; 2806 } 2807 } 2808 } 2809 2810 // If the callee takes no arguments then go on to check the results of the 2811 // call. 2812 if (!Outs.empty()) { 2813 // Check if stack adjustment is needed. For now, do not do this if any 2814 // argument is passed on the stack. 
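    // Worked example (sketch, 32-bit cdecl): i32 @f(i32 %a, i32 %b) doing
    //   tail call i32 @g(i32 %a, i32 %b)
    // passes %a and %b at the same stack offsets (0 and 4) that @f received
    // them at, so MatchingStackOffset succeeds for both and no adjustment is
    // needed; tail-calling @g(i32 %b, i32 %a) instead swaps the required
    // offsets and the sibcall is rejected below.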
2815 SmallVector<CCValAssign, 16> ArgLocs; 2816 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2817 getTargetMachine(), ArgLocs, *DAG.getContext()); 2818 2819 // Allocate shadow area for Win64 2820 if (Subtarget->isTargetWin64()) { 2821 CCInfo.AllocateStack(32, 8); 2822 } 2823 2824 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2825 if (CCInfo.getNextStackOffset()) { 2826 MachineFunction &MF = DAG.getMachineFunction(); 2827 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2828 return false; 2829 2830 // Check if the arguments are already laid out in the right way as 2831 // the caller's fixed stack objects. 2832 MachineFrameInfo *MFI = MF.getFrameInfo(); 2833 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2834 const X86InstrInfo *TII = 2835 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2836 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2837 CCValAssign &VA = ArgLocs[i]; 2838 SDValue Arg = OutVals[i]; 2839 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2840 if (VA.getLocInfo() == CCValAssign::Indirect) 2841 return false; 2842 if (!VA.isRegLoc()) { 2843 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2844 MFI, MRI, TII)) 2845 return false; 2846 } 2847 } 2848 } 2849 2850 // If the tailcall address may be in a register, then make sure it's 2851 // possible to register allocate for it. In 32-bit, the call address can 2852 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2853 // callee-saved registers are restored. These happen to be the same 2854 // registers used to pass 'inreg' arguments so watch out for those. 2855 if (!Subtarget->is64Bit() && 2856 !isa<GlobalAddressSDNode>(Callee) && 2857 !isa<ExternalSymbolSDNode>(Callee)) { 2858 unsigned NumInRegs = 0; 2859 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2860 CCValAssign &VA = ArgLocs[i]; 2861 if (!VA.isRegLoc()) 2862 continue; 2863 unsigned Reg = VA.getLocReg(); 2864 switch (Reg) { 2865 default: break; 2866 case X86::EAX: case X86::EDX: case X86::ECX: 2867 if (++NumInRegs == 3) 2868 return false; 2869 break; 2870 } 2871 } 2872 } 2873 } 2874 2875 return true; 2876} 2877 2878FastISel * 2879X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 2880 const TargetLibraryInfo *libInfo) const { 2881 return X86::createFastISel(funcInfo, libInfo); 2882} 2883 2884 2885//===----------------------------------------------------------------------===// 2886// Other Lowering Hooks 2887//===----------------------------------------------------------------------===// 2888 2889static bool MayFoldLoad(SDValue Op) { 2890 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2891} 2892 2893static bool MayFoldIntoStore(SDValue Op) { 2894 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2895} 2896 2897static bool isTargetShuffle(unsigned Opcode) { 2898 switch(Opcode) { 2899 default: return false; 2900 case X86ISD::PSHUFD: 2901 case X86ISD::PSHUFHW: 2902 case X86ISD::PSHUFLW: 2903 case X86ISD::SHUFP: 2904 case X86ISD::PALIGN: 2905 case X86ISD::MOVLHPS: 2906 case X86ISD::MOVLHPD: 2907 case X86ISD::MOVHLPS: 2908 case X86ISD::MOVLPS: 2909 case X86ISD::MOVLPD: 2910 case X86ISD::MOVSHDUP: 2911 case X86ISD::MOVSLDUP: 2912 case X86ISD::MOVDDUP: 2913 case X86ISD::MOVSS: 2914 case X86ISD::MOVSD: 2915 case X86ISD::UNPCKL: 2916 case X86ISD::UNPCKH: 2917 case X86ISD::VPERMILP: 2918 case X86ISD::VPERM2X128: 2919 case X86ISD::VPERMI: 2920 return true; 2921 } 2922} 2923 2924static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2925 
SDValue V1, SelectionDAG &DAG) { 2926 switch(Opc) { 2927 default: llvm_unreachable("Unknown x86 shuffle node"); 2928 case X86ISD::MOVSHDUP: 2929 case X86ISD::MOVSLDUP: 2930 case X86ISD::MOVDDUP: 2931 return DAG.getNode(Opc, dl, VT, V1); 2932 } 2933} 2934 2935static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2936 SDValue V1, unsigned TargetMask, 2937 SelectionDAG &DAG) { 2938 switch(Opc) { 2939 default: llvm_unreachable("Unknown x86 shuffle node"); 2940 case X86ISD::PSHUFD: 2941 case X86ISD::PSHUFHW: 2942 case X86ISD::PSHUFLW: 2943 case X86ISD::VPERMILP: 2944 case X86ISD::VPERMI: 2945 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2946 } 2947} 2948 2949static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2950 SDValue V1, SDValue V2, unsigned TargetMask, 2951 SelectionDAG &DAG) { 2952 switch(Opc) { 2953 default: llvm_unreachable("Unknown x86 shuffle node"); 2954 case X86ISD::PALIGN: 2955 case X86ISD::SHUFP: 2956 case X86ISD::VPERM2X128: 2957 return DAG.getNode(Opc, dl, VT, V1, V2, 2958 DAG.getConstant(TargetMask, MVT::i8)); 2959 } 2960} 2961 2962static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2963 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2964 switch(Opc) { 2965 default: llvm_unreachable("Unknown x86 shuffle node"); 2966 case X86ISD::MOVLHPS: 2967 case X86ISD::MOVLHPD: 2968 case X86ISD::MOVHLPS: 2969 case X86ISD::MOVLPS: 2970 case X86ISD::MOVLPD: 2971 case X86ISD::MOVSS: 2972 case X86ISD::MOVSD: 2973 case X86ISD::UNPCKL: 2974 case X86ISD::UNPCKH: 2975 return DAG.getNode(Opc, dl, VT, V1, V2); 2976 } 2977} 2978 2979SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2980 MachineFunction &MF = DAG.getMachineFunction(); 2981 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2982 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2983 2984 if (ReturnAddrIndex == 0) { 2985 // Set up a frame object for the return address. 2986 uint64_t SlotSize = TD->getPointerSize(); 2987 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2988 false); 2989 FuncInfo->setRAIndex(ReturnAddrIndex); 2990 } 2991 2992 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2993} 2994 2995 2996bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2997 bool hasSymbolicDisplacement) { 2998 // Offset should fit into 32 bit immediate field. 2999 if (!isInt<32>(Offset)) 3000 return false; 3001 3002 // If we don't have a symbolic displacement - we don't have any extra 3003 // restrictions. 3004 if (!hasSymbolicDisplacement) 3005 return true; 3006 3007 // FIXME: Some tweaks might be needed for medium code model. 3008 if (M != CodeModel::Small && M != CodeModel::Kernel) 3009 return false; 3010 3011 // For small code model we assume that latest object is 16MB before end of 31 3012 // bits boundary. We may also accept pretty large negative constants knowing 3013 // that all objects are in the positive half of address space. 3014 if (M == CodeModel::Small && Offset < 16*1024*1024) 3015 return true; 3016 3017 // For kernel code model we know that all object resist in the negative half 3018 // of 32bits address space. We may not accept negative offsets, since they may 3019 // be just off and we may accept pretty large positive ones. 3020 if (M == CodeModel::Kernel && Offset > 0) 3021 return true; 3022 3023 return false; 3024} 3025 3026/// isCalleePop - Determines whether the callee is required to pop its 3027/// own arguments. 
Callee pop is necessary to support tail calls. 3028bool X86::isCalleePop(CallingConv::ID CallingConv, 3029 bool is64Bit, bool IsVarArg, bool TailCallOpt) { 3030 if (IsVarArg) 3031 return false; 3032 3033 switch (CallingConv) { 3034 default: 3035 return false; 3036 case CallingConv::X86_StdCall: 3037 return !is64Bit; 3038 case CallingConv::X86_FastCall: 3039 return !is64Bit; 3040 case CallingConv::X86_ThisCall: 3041 return !is64Bit; 3042 case CallingConv::Fast: 3043 return TailCallOpt; 3044 case CallingConv::GHC: 3045 return TailCallOpt; 3046 } 3047} 3048 3049/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 3050/// specific condition code, returning the condition code and the LHS/RHS of the 3051/// comparison to make. 3052static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 3053 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 3054 if (!isFP) { 3055 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 3056 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 3057 // X > -1 -> X == 0, jump !sign. 3058 RHS = DAG.getConstant(0, RHS.getValueType()); 3059 return X86::COND_NS; 3060 } 3061 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 3062 // X < 0 -> X == 0, jump on sign. 3063 return X86::COND_S; 3064 } 3065 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 3066 // X < 1 -> X <= 0 3067 RHS = DAG.getConstant(0, RHS.getValueType()); 3068 return X86::COND_LE; 3069 } 3070 } 3071 3072 switch (SetCCOpcode) { 3073 default: llvm_unreachable("Invalid integer condition!"); 3074 case ISD::SETEQ: return X86::COND_E; 3075 case ISD::SETGT: return X86::COND_G; 3076 case ISD::SETGE: return X86::COND_GE; 3077 case ISD::SETLT: return X86::COND_L; 3078 case ISD::SETLE: return X86::COND_LE; 3079 case ISD::SETNE: return X86::COND_NE; 3080 case ISD::SETULT: return X86::COND_B; 3081 case ISD::SETUGT: return X86::COND_A; 3082 case ISD::SETULE: return X86::COND_BE; 3083 case ISD::SETUGE: return X86::COND_AE; 3084 } 3085 } 3086 3087 // First determine if it is required or is profitable to flip the operands. 3088 3089 // If LHS is a foldable load, but RHS is not, flip the condition. 
3090  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3091      !ISD::isNON_EXTLoad(RHS.getNode())) {
3092    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3093    std::swap(LHS, RHS);
3094  }
3095
3096  switch (SetCCOpcode) {
3097  default: break;
3098  case ISD::SETOLT:
3099  case ISD::SETOLE:
3100  case ISD::SETUGT:
3101  case ISD::SETUGE:
3102    std::swap(LHS, RHS);
3103    break;
3104  }
3105
3106  // On a floating point condition, the flags are set as follows:
3107  //  ZF  PF  CF   op
3108  //   0 | 0 | 0 | X > Y
3109  //   0 | 0 | 1 | X < Y
3110  //   1 | 0 | 0 | X == Y
3111  //   1 | 1 | 1 | unordered
3112  switch (SetCCOpcode) {
3113  default: llvm_unreachable("Condcode should be pre-legalized away");
3114  case ISD::SETUEQ:
3115  case ISD::SETEQ:   return X86::COND_E;
3116  case ISD::SETOLT:              // flipped
3117  case ISD::SETOGT:
3118  case ISD::SETGT:   return X86::COND_A;
3119  case ISD::SETOLE:              // flipped
3120  case ISD::SETOGE:
3121  case ISD::SETGE:   return X86::COND_AE;
3122  case ISD::SETUGT:              // flipped
3123  case ISD::SETULT:
3124  case ISD::SETLT:   return X86::COND_B;
3125  case ISD::SETUGE:              // flipped
3126  case ISD::SETULE:
3127  case ISD::SETLE:   return X86::COND_BE;
3128  case ISD::SETONE:
3129  case ISD::SETNE:   return X86::COND_NE;
3130  case ISD::SETUO:   return X86::COND_P;
3131  case ISD::SETO:    return X86::COND_NP;
3132  case ISD::SETOEQ:
3133  case ISD::SETUNE:  return X86::COND_INVALID;
3134  }
3135}
3136
3137/// hasFPCMov - Is there a floating-point cmov for the specific X86 condition
3138/// code? The current x86 ISA includes the following FP cmov instructions:
3139/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3140static bool hasFPCMov(unsigned X86CC) {
3141  switch (X86CC) {
3142  default:
3143    return false;
3144  case X86::COND_B:
3145  case X86::COND_BE:
3146  case X86::COND_E:
3147  case X86::COND_P:
3148  case X86::COND_A:
3149  case X86::COND_AE:
3150  case X86::COND_NE:
3151  case X86::COND_NP:
3152    return true;
3153  }
3154}
3155
3156/// isFPImmLegal - Returns true if the target can instruction select the
3157/// specified FP immediate natively. If false, the legalizer will
3158/// materialize the FP immediate as a load from a constant pool.
3159bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3160  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3161    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3162      return true;
3163  }
3164  return false;
3165}
3166
3167/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3168/// the specified half-open range [Low, Hi).
3169static bool isUndefOrInRange(int Val, int Low, int Hi) {
3170  return (Val < 0) || (Val >= Low && Val < Hi);
3171}
3172
3173/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3174/// specified value.
3175static bool isUndefOrEqual(int Val, int CmpVal) {
3176  if (Val < 0 || Val == CmpVal)
3177    return true;
3178  return false;
3179}
3180
3181/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3182/// at position Pos and ending at Pos+Size, is either undef or matches the
3183/// corresponding element of the sequential range [Low, Low+Size).
3184static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3185                                       unsigned Pos, unsigned Size, int Low) {
3186  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3187    if (!isUndefOrEqual(Mask[i], Low))
3188      return false;
3189  return true;
3190}
3191
3192/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3193/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
3194/// the second operand.
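/// As a concrete illustration for v4i32: the mask <2, 1, 0, 3> qualifies (every
/// index selects from the first operand) and is later encoded by
/// getShuffleSHUFImmediate below as the immediate 0xC6, while <0, 1, 4, 5> is
/// rejected because indices 4 and 5 reference the second operand.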
3195static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { 3196 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 3197 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 3198 if (VT == MVT::v2f64 || VT == MVT::v2i64) 3199 return (Mask[0] < 2 && Mask[1] < 2); 3200 return false; 3201} 3202 3203/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 3204/// is suitable for input to PSHUFHW. 3205static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { 3206 if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16)) 3207 return false; 3208 3209 // Lower quadword copied in order or undef. 3210 if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) 3211 return false; 3212 3213 // Upper quadword shuffled. 3214 for (unsigned i = 4; i != 8; ++i) 3215 if (!isUndefOrInRange(Mask[i], 4, 8)) 3216 return false; 3217 3218 if (VT == MVT::v16i16) { 3219 // Lower quadword copied in order or undef. 3220 if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) 3221 return false; 3222 3223 // Upper quadword shuffled. 3224 for (unsigned i = 12; i != 16; ++i) 3225 if (!isUndefOrInRange(Mask[i], 12, 16)) 3226 return false; 3227 } 3228 3229 return true; 3230} 3231 3232/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 3233/// is suitable for input to PSHUFLW. 3234static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { 3235 if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16)) 3236 return false; 3237 3238 // Upper quadword copied in order. 3239 if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) 3240 return false; 3241 3242 // Lower quadword shuffled. 3243 for (unsigned i = 0; i != 4; ++i) 3244 if (!isUndefOrInRange(Mask[i], 0, 4)) 3245 return false; 3246 3247 if (VT == MVT::v16i16) { 3248 // Upper quadword copied in order. 3249 if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) 3250 return false; 3251 3252 // Lower quadword shuffled. 3253 for (unsigned i = 8; i != 12; ++i) 3254 if (!isUndefOrInRange(Mask[i], 8, 12)) 3255 return false; 3256 } 3257 3258 return true; 3259} 3260 3261/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 3262/// is suitable for input to PALIGNR. 3263static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT, 3264 const X86Subtarget *Subtarget) { 3265 if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) || 3266 (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())) 3267 return false; 3268 3269 unsigned NumElts = VT.getVectorNumElements(); 3270 unsigned NumLanes = VT.getSizeInBits()/128; 3271 unsigned NumLaneElts = NumElts/NumLanes; 3272 3273 // Do not handle 64-bit element shuffles with palignr. 3274 if (NumLaneElts == 2) 3275 return false; 3276 3277 for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { 3278 unsigned i; 3279 for (i = 0; i != NumLaneElts; ++i) { 3280 if (Mask[i+l] >= 0) 3281 break; 3282 } 3283 3284 // Lane is all undef, go to next lane 3285 if (i == NumLaneElts) 3286 continue; 3287 3288 int Start = Mask[i+l]; 3289 3290 // Make sure its in this lane in one of the sources 3291 if (!isUndefOrInRange(Start, l, l+NumLaneElts) && 3292 !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) 3293 return false; 3294 3295 // If not lane 0, then we must match lane 0 3296 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) 3297 return false; 3298 3299 // Correct second source to be contiguous with first source 3300 if (Start >= (int)NumElts) 3301 Start -= NumElts - NumLaneElts; 3302 3303 // Make sure we're shifting in the right direction. 
3304    if (Start <= (int)(i+l))
3305      return false;
3306
3307    Start -= i;
3308
3309    // Check the rest of the elements to see if they are consecutive.
3310    for (++i; i != NumLaneElts; ++i) {
3311      int Idx = Mask[i+l];
3312
3313      // Make sure it's in this lane
3314      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
3315          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
3316        return false;
3317
3318      // If not lane 0, then we must match lane 0
3319      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
3320        return false;
3321
3322      if (Idx >= (int)NumElts)
3323        Idx -= NumElts - NumLaneElts;
3324
3325      if (!isUndefOrEqual(Idx, Start+i))
3326        return false;
3327
3328    }
3329  }
3330
3331  return true;
3332}
3333
3334/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3335/// the two vector operands have swapped position.
3336static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
3337                                     unsigned NumElems) {
3338  for (unsigned i = 0; i != NumElems; ++i) {
3339    int idx = Mask[i];
3340    if (idx < 0)
3341      continue;
3342    else if (idx < (int)NumElems)
3343      Mask[i] = idx + NumElems;
3344    else
3345      Mask[i] = idx - NumElems;
3346  }
3347}
3348
3349/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3350/// specifies a shuffle of elements that is suitable for input to 128/256-bit
3351/// SHUFPS and SHUFPD. If Commuted is true, it checks whether the sources are
3352/// in the reverse order of what the x86 shuffles want.
3353static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
3354                        bool Commuted = false) {
3355  if (!HasAVX && VT.getSizeInBits() == 256)
3356    return false;
3357
3358  unsigned NumElems = VT.getVectorNumElements();
3359  unsigned NumLanes = VT.getSizeInBits()/128;
3360  unsigned NumLaneElems = NumElems/NumLanes;
3361
3362  if (NumLaneElems != 2 && NumLaneElems != 4)
3363    return false;
3364
3365  // VSHUFPSY divides the resulting vector into 4 chunks.
3366  // The sources are also split into 4 chunks, and each destination
3367  // chunk must come from a different source chunk.
3368  //
3369  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
3370  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
3371  //
3372  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
3373  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
3374  //
3375  // VSHUFPDY divides the resulting vector into 4 chunks.
3376  // The sources are also split into 4 chunks, and each destination
3377  // chunk must come from a different source chunk.
3378  //
3379  //  SRC1 =>      X3       X2       X1       X0
3380  //  SRC2 =>      Y3       Y2       Y1       Y0
3381  //
3382  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
3383  //
3384  unsigned HalfLaneElems = NumLaneElems/2;
3385  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
3386    for (unsigned i = 0; i != NumLaneElems; ++i) {
3387      int Idx = Mask[i+l];
3388      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
3389      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
3390        return false;
3391      // For VSHUFPSY, the mask of the second half must be the same as the
3392      // first but with the appropriate offsets. This works in the same way as
3393      // VPERMILPS works with masks.
3394      if (NumElems != 8 || l == 0 || Mask[i] < 0)
3395        continue;
3396      if (!isUndefOrEqual(Idx, Mask[i]+l))
3397        return false;
3398    }
3399  }
3400
3401  return true;
3402}
3403
3404/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3405/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
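/// For example (v4f32), the canonical form is
///   vector_shuffle %a, %b, <6, 7, 2, 3>
/// i.e. the result is the high half of %b followed by the high half of %a,
/// which is what the MOVHLPS instruction computes; undef is accepted in any
/// mask position.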
3406static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) { 3407 if (!VT.is128BitVector()) 3408 return false; 3409 3410 unsigned NumElems = VT.getVectorNumElements(); 3411 3412 if (NumElems != 4) 3413 return false; 3414 3415 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3416 return isUndefOrEqual(Mask[0], 6) && 3417 isUndefOrEqual(Mask[1], 7) && 3418 isUndefOrEqual(Mask[2], 2) && 3419 isUndefOrEqual(Mask[3], 3); 3420} 3421 3422/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3423/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3424/// <2, 3, 2, 3> 3425static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) { 3426 if (!VT.is128BitVector()) 3427 return false; 3428 3429 unsigned NumElems = VT.getVectorNumElements(); 3430 3431 if (NumElems != 4) 3432 return false; 3433 3434 return isUndefOrEqual(Mask[0], 2) && 3435 isUndefOrEqual(Mask[1], 3) && 3436 isUndefOrEqual(Mask[2], 2) && 3437 isUndefOrEqual(Mask[3], 3); 3438} 3439 3440/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3441/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3442static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { 3443 if (!VT.is128BitVector()) 3444 return false; 3445 3446 unsigned NumElems = VT.getVectorNumElements(); 3447 3448 if (NumElems != 2 && NumElems != 4) 3449 return false; 3450 3451 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3452 if (!isUndefOrEqual(Mask[i], i + NumElems)) 3453 return false; 3454 3455 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) 3456 if (!isUndefOrEqual(Mask[i], i)) 3457 return false; 3458 3459 return true; 3460} 3461 3462/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3463/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3464static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { 3465 if (!VT.is128BitVector()) 3466 return false; 3467 3468 unsigned NumElems = VT.getVectorNumElements(); 3469 3470 if (NumElems != 2 && NumElems != 4) 3471 return false; 3472 3473 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3474 if (!isUndefOrEqual(Mask[i], i)) 3475 return false; 3476 3477 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3478 if (!isUndefOrEqual(Mask[i + e], i + NumElems)) 3479 return false; 3480 3481 return true; 3482} 3483 3484// 3485// Some special combinations that can be optimized. 3486// 3487static 3488SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, 3489 SelectionDAG &DAG) { 3490 EVT VT = SVOp->getValueType(0); 3491 DebugLoc dl = SVOp->getDebugLoc(); 3492 3493 if (VT != MVT::v8i32 && VT != MVT::v8f32) 3494 return SDValue(); 3495 3496 ArrayRef<int> Mask = SVOp->getMask(); 3497 3498 // These are the special masks that may be optimized. 
3499 static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; 3500 static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; 3501 bool MatchEvenMask = true; 3502 bool MatchOddMask = true; 3503 for (int i=0; i<8; ++i) { 3504 if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) 3505 MatchEvenMask = false; 3506 if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) 3507 MatchOddMask = false; 3508 } 3509 static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1}; 3510 static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1}; 3511 3512 const int *CompactionMask; 3513 if (MatchEvenMask) 3514 CompactionMask = CompactionMaskEven; 3515 else if (MatchOddMask) 3516 CompactionMask = CompactionMaskOdd; 3517 else 3518 return SDValue(); 3519 3520 SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); 3521 3522 SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0), 3523 UndefNode, CompactionMask); 3524 SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1), 3525 UndefNode, CompactionMask); 3526 static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13}; 3527 return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask); 3528} 3529 3530/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3531/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3532static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, 3533 bool HasAVX2, bool V2IsSplat = false) { 3534 unsigned NumElts = VT.getVectorNumElements(); 3535 3536 assert((VT.is128BitVector() || VT.is256BitVector()) && 3537 "Unsupported vector type for unpckh"); 3538 3539 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3540 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3541 return false; 3542 3543 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3544 // independently on 128-bit lanes. 3545 unsigned NumLanes = VT.getSizeInBits()/128; 3546 unsigned NumLaneElts = NumElts/NumLanes; 3547 3548 for (unsigned l = 0; l != NumLanes; ++l) { 3549 for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; 3550 i != (l+1)*NumLaneElts; 3551 i += 2, ++j) { 3552 int BitI = Mask[i]; 3553 int BitI1 = Mask[i+1]; 3554 if (!isUndefOrEqual(BitI, j)) 3555 return false; 3556 if (V2IsSplat) { 3557 if (!isUndefOrEqual(BitI1, NumElts)) 3558 return false; 3559 } else { 3560 if (!isUndefOrEqual(BitI1, j + NumElts)) 3561 return false; 3562 } 3563 } 3564 } 3565 3566 return true; 3567} 3568 3569/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3570/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3571static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, 3572 bool HasAVX2, bool V2IsSplat = false) { 3573 unsigned NumElts = VT.getVectorNumElements(); 3574 3575 assert((VT.is128BitVector() || VT.is256BitVector()) && 3576 "Unsupported vector type for unpckh"); 3577 3578 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3579 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3580 return false; 3581 3582 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3583 // independently on 128-bit lanes. 
3584 unsigned NumLanes = VT.getSizeInBits()/128; 3585 unsigned NumLaneElts = NumElts/NumLanes; 3586 3587 for (unsigned l = 0; l != NumLanes; ++l) { 3588 for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; 3589 i != (l+1)*NumLaneElts; i += 2, ++j) { 3590 int BitI = Mask[i]; 3591 int BitI1 = Mask[i+1]; 3592 if (!isUndefOrEqual(BitI, j)) 3593 return false; 3594 if (V2IsSplat) { 3595 if (isUndefOrEqual(BitI1, NumElts)) 3596 return false; 3597 } else { 3598 if (!isUndefOrEqual(BitI1, j+NumElts)) 3599 return false; 3600 } 3601 } 3602 } 3603 return true; 3604} 3605 3606/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3607/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3608/// <0, 0, 1, 1> 3609static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, 3610 bool HasAVX2) { 3611 unsigned NumElts = VT.getVectorNumElements(); 3612 3613 assert((VT.is128BitVector() || VT.is256BitVector()) && 3614 "Unsupported vector type for unpckh"); 3615 3616 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3617 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3618 return false; 3619 3620 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern 3621 // FIXME: Need a better way to get rid of this, there's no latency difference 3622 // between UNPCKLPD and MOVDDUP, the later should always be checked first and 3623 // the former later. We should also remove the "_undef" special mask. 3624 if (NumElts == 4 && VT.getSizeInBits() == 256) 3625 return false; 3626 3627 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3628 // independently on 128-bit lanes. 3629 unsigned NumLanes = VT.getSizeInBits()/128; 3630 unsigned NumLaneElts = NumElts/NumLanes; 3631 3632 for (unsigned l = 0; l != NumLanes; ++l) { 3633 for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; 3634 i != (l+1)*NumLaneElts; 3635 i += 2, ++j) { 3636 int BitI = Mask[i]; 3637 int BitI1 = Mask[i+1]; 3638 3639 if (!isUndefOrEqual(BitI, j)) 3640 return false; 3641 if (!isUndefOrEqual(BitI1, j)) 3642 return false; 3643 } 3644 } 3645 3646 return true; 3647} 3648 3649/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3650/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3651/// <2, 2, 3, 3> 3652static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { 3653 unsigned NumElts = VT.getVectorNumElements(); 3654 3655 assert((VT.is128BitVector() || VT.is256BitVector()) && 3656 "Unsupported vector type for unpckh"); 3657 3658 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3659 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3660 return false; 3661 3662 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3663 // independently on 128-bit lanes. 3664 unsigned NumLanes = VT.getSizeInBits()/128; 3665 unsigned NumLaneElts = NumElts/NumLanes; 3666 3667 for (unsigned l = 0; l != NumLanes; ++l) { 3668 for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; 3669 i != (l+1)*NumLaneElts; i += 2, ++j) { 3670 int BitI = Mask[i]; 3671 int BitI1 = Mask[i+1]; 3672 if (!isUndefOrEqual(BitI, j)) 3673 return false; 3674 if (!isUndefOrEqual(BitI1, j)) 3675 return false; 3676 } 3677 } 3678 return true; 3679} 3680 3681/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3682/// specifies a shuffle of elements that is suitable for input to MOVSS, 3683/// MOVSD, and MOVD, i.e. setting the lowest element. 
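/// For example, <4, 1, 2, 3> on v4f32/v4i32 and <2, 1> on v2f64/v2i64 qualify:
/// the low element comes from the second operand and the remaining elements
/// are taken from the first operand in order.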
3684static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
3685  if (VT.getVectorElementType().getSizeInBits() < 32)
3686    return false;
3687  if (!VT.is128BitVector())
3688    return false;
3689
3690  unsigned NumElts = VT.getVectorNumElements();
3691
3692  if (!isUndefOrEqual(Mask[0], NumElts))
3693    return false;
3694
3695  for (unsigned i = 1; i != NumElts; ++i)
3696    if (!isUndefOrEqual(Mask[i], i))
3697      return false;
3698
3699  return true;
3700}
3701
3702/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
3703/// as permutations between 128-bit chunks or halves. As an example: this
3704/// shuffle below:
3705///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
3706/// takes its first half from the second half of V1 and its second half from
3707/// the second half of V2.
3708static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
3709  if (!HasAVX || !VT.is256BitVector())
3710    return false;
3711
3712  // The shuffle result is divided into half A and half B. In total the two
3713  // sources have 4 halves, namely: C, D, E, F. The final values of A and
3714  // B must come from C, D, E or F.
3715  unsigned HalfSize = VT.getVectorNumElements()/2;
3716  bool MatchA = false, MatchB = false;
3717
3718  // Check if A comes from one of C, D, E, F.
3719  for (unsigned Half = 0; Half != 4; ++Half) {
3720    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
3721      MatchA = true;
3722      break;
3723    }
3724  }
3725
3726  // Check if B comes from one of C, D, E, F.
3727  for (unsigned Half = 0; Half != 4; ++Half) {
3728    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
3729      MatchB = true;
3730      break;
3731    }
3732  }
3733
3734  return MatchA && MatchB;
3735}
3736
3737/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
3738/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
3739static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
3740  EVT VT = SVOp->getValueType(0);
3741
3742  unsigned HalfSize = VT.getVectorNumElements()/2;
3743
3744  unsigned FstHalf = 0, SndHalf = 0;
3745  for (unsigned i = 0; i < HalfSize; ++i) {
3746    if (SVOp->getMaskElt(i) > 0) {
3747      FstHalf = SVOp->getMaskElt(i)/HalfSize;
3748      break;
3749    }
3750  }
3751  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
3752    if (SVOp->getMaskElt(i) > 0) {
3753      SndHalf = SVOp->getMaskElt(i)/HalfSize;
3754      break;
3755    }
3756  }
3757
3758  return (FstHalf | (SndHalf << 4));
3759}
3760
3761/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
3762/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
3763/// Note that VPERMIL mask matching differs depending on whether the underlying
3764/// element type is 32 or 64 bits. For VPERMILPS the high half of the mask must
3765/// repeat the low half's pattern, but applied to the upper half of the source.
3766/// For VPERMILPD the two lanes can be shuffled independently of each other,
3767/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
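/// For instance, on v8f32 the in-lane swap <1, 0, 3, 2, 5, 4, 7, 6> is a valid
/// VPERMILPS pattern (the upper 128-bit lane repeats the lower-lane pattern
/// offset by 4), while <1, 0, 3, 2, 4, 5, 6, 7> is rejected because the two
/// lanes do not use the same in-lane permutation.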
3768static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
3769  if (!HasAVX)
3770    return false;
3771
3772  unsigned NumElts = VT.getVectorNumElements();
3773  // Only match 256-bit with 32/64-bit types
3774  if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
3775    return false;
3776
3777  unsigned NumLanes = VT.getSizeInBits()/128;
3778  unsigned LaneSize = NumElts/NumLanes;
3779  for (unsigned l = 0; l != NumElts; l += LaneSize) {
3780    for (unsigned i = 0; i != LaneSize; ++i) {
3781      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
3782        return false;
3783      if (NumElts != 8 || l == 0)
3784        continue;
3785      // VPERMILPS handling
3786      if (Mask[i] < 0)
3787        continue;
3788      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
3789        return false;
3790    }
3791  }
3792
3793  return true;
3794}
3795
3796/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of
3797/// what x86 movss wants. X86 movs requires the lowest element to be the lowest
3798/// element of vector 2 and the other elements to come from vector 1 in order.
3799static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
3800                               bool V2IsSplat = false, bool V2IsUndef = false) {
3801  if (!VT.is128BitVector())
3802    return false;
3803
3804  unsigned NumOps = VT.getVectorNumElements();
3805  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3806    return false;
3807
3808  if (!isUndefOrEqual(Mask[0], 0))
3809    return false;
3810
3811  for (unsigned i = 1; i != NumOps; ++i)
3812    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3813          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3814          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3815      return false;
3816
3817  return true;
3818}
3819
3820/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3821/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3822/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
3823static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
3824                           const X86Subtarget *Subtarget) {
3825  if (!Subtarget->hasSSE3())
3826    return false;
3827
3828  unsigned NumElems = VT.getVectorNumElements();
3829
3830  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3831      (VT.getSizeInBits() == 256 && NumElems != 8))
3832    return false;
3833
3834  // "i+1" is the value the indexed mask element must have
3835  for (unsigned i = 0; i != NumElems; i += 2)
3836    if (!isUndefOrEqual(Mask[i], i+1) ||
3837        !isUndefOrEqual(Mask[i+1], i+1))
3838      return false;
3839
3840  return true;
3841}
3842
3843/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3844/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3845/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
3846static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
3847                           const X86Subtarget *Subtarget) {
3848  if (!Subtarget->hasSSE3())
3849    return false;
3850
3851  unsigned NumElems = VT.getVectorNumElements();
3852
3853  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
3854      (VT.getSizeInBits() == 256 && NumElems != 8))
3855    return false;
3856
3857  // "i" is the value the indexed mask element must have
3858  for (unsigned i = 0; i != NumElems; i += 2)
3859    if (!isUndefOrEqual(Mask[i], i) ||
3860        !isUndefOrEqual(Mask[i+1], i))
3861      return false;
3862
3863  return true;
3864}
3865
3866/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
3867/// specifies a shuffle of elements that is suitable for input to the 256-bit
3868/// version of MOVDDUP.
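/// For example, on v4f64 the mask <0, 0, 2, 2> qualifies: element 0 is
/// broadcast within the low 128-bit lane and element 2 within the high lane,
/// which is what the 256-bit VMOVDDUP produces.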
3869static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { 3870 if (!HasAVX || !VT.is256BitVector()) 3871 return false; 3872 3873 unsigned NumElts = VT.getVectorNumElements(); 3874 if (NumElts != 4) 3875 return false; 3876 3877 for (unsigned i = 0; i != NumElts/2; ++i) 3878 if (!isUndefOrEqual(Mask[i], 0)) 3879 return false; 3880 for (unsigned i = NumElts/2; i != NumElts; ++i) 3881 if (!isUndefOrEqual(Mask[i], NumElts/2)) 3882 return false; 3883 return true; 3884} 3885 3886/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3887/// specifies a shuffle of elements that is suitable for input to 128-bit 3888/// version of MOVDDUP. 3889static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { 3890 if (!VT.is128BitVector()) 3891 return false; 3892 3893 unsigned e = VT.getVectorNumElements() / 2; 3894 for (unsigned i = 0; i != e; ++i) 3895 if (!isUndefOrEqual(Mask[i], i)) 3896 return false; 3897 for (unsigned i = 0; i != e; ++i) 3898 if (!isUndefOrEqual(Mask[e+i], i)) 3899 return false; 3900 return true; 3901} 3902 3903/// isVEXTRACTF128Index - Return true if the specified 3904/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3905/// suitable for input to VEXTRACTF128. 3906bool X86::isVEXTRACTF128Index(SDNode *N) { 3907 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3908 return false; 3909 3910 // The index should be aligned on a 128-bit boundary. 3911 uint64_t Index = 3912 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3913 3914 unsigned VL = N->getValueType(0).getVectorNumElements(); 3915 unsigned VBits = N->getValueType(0).getSizeInBits(); 3916 unsigned ElSize = VBits / VL; 3917 bool Result = (Index * ElSize) % 128 == 0; 3918 3919 return Result; 3920} 3921 3922/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 3923/// operand specifies a subvector insert that is suitable for input to 3924/// VINSERTF128. 3925bool X86::isVINSERTF128Index(SDNode *N) { 3926 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3927 return false; 3928 3929 // The index should be aligned on a 128-bit boundary. 3930 uint64_t Index = 3931 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3932 3933 unsigned VL = N->getValueType(0).getVectorNumElements(); 3934 unsigned VBits = N->getValueType(0).getSizeInBits(); 3935 unsigned ElSize = VBits / VL; 3936 bool Result = (Index * ElSize) % 128 == 0; 3937 3938 return Result; 3939} 3940 3941/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3942/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3943/// Handles 128-bit and 256-bit. 3944static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { 3945 EVT VT = N->getValueType(0); 3946 3947 assert((VT.is128BitVector() || VT.is256BitVector()) && 3948 "Unsupported vector type for PSHUF/SHUFP"); 3949 3950 // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate 3951 // independently on 128-bit lanes. 3952 unsigned NumElts = VT.getVectorNumElements(); 3953 unsigned NumLanes = VT.getSizeInBits()/128; 3954 unsigned NumLaneElts = NumElts/NumLanes; 3955 3956 assert((NumLaneElts == 2 || NumLaneElts == 4) && 3957 "Only supports 2 or 4 elements per lane"); 3958 3959 unsigned Shift = (NumLaneElts == 4) ? 
1 : 0; 3960 unsigned Mask = 0; 3961 for (unsigned i = 0; i != NumElts; ++i) { 3962 int Elt = N->getMaskElt(i); 3963 if (Elt < 0) continue; 3964 Elt &= NumLaneElts - 1; 3965 unsigned ShAmt = (i << Shift) % 8; 3966 Mask |= Elt << ShAmt; 3967 } 3968 3969 return Mask; 3970} 3971 3972/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3973/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3974static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { 3975 EVT VT = N->getValueType(0); 3976 3977 assert((VT == MVT::v8i16 || VT == MVT::v16i16) && 3978 "Unsupported vector type for PSHUFHW"); 3979 3980 unsigned NumElts = VT.getVectorNumElements(); 3981 3982 unsigned Mask = 0; 3983 for (unsigned l = 0; l != NumElts; l += 8) { 3984 // 8 nodes per lane, but we only care about the last 4. 3985 for (unsigned i = 0; i < 4; ++i) { 3986 int Elt = N->getMaskElt(l+i+4); 3987 if (Elt < 0) continue; 3988 Elt &= 0x3; // only 2-bits. 3989 Mask |= Elt << (i * 2); 3990 } 3991 } 3992 3993 return Mask; 3994} 3995 3996/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3997/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3998static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { 3999 EVT VT = N->getValueType(0); 4000 4001 assert((VT == MVT::v8i16 || VT == MVT::v16i16) && 4002 "Unsupported vector type for PSHUFHW"); 4003 4004 unsigned NumElts = VT.getVectorNumElements(); 4005 4006 unsigned Mask = 0; 4007 for (unsigned l = 0; l != NumElts; l += 8) { 4008 // 8 nodes per lane, but we only care about the first 4. 4009 for (unsigned i = 0; i < 4; ++i) { 4010 int Elt = N->getMaskElt(l+i); 4011 if (Elt < 0) continue; 4012 Elt &= 0x3; // only 2-bits 4013 Mask |= Elt << (i * 2); 4014 } 4015 } 4016 4017 return Mask; 4018} 4019 4020/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 4021/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 4022static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { 4023 EVT VT = SVOp->getValueType(0); 4024 unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; 4025 4026 unsigned NumElts = VT.getVectorNumElements(); 4027 unsigned NumLanes = VT.getSizeInBits()/128; 4028 unsigned NumLaneElts = NumElts/NumLanes; 4029 4030 int Val = 0; 4031 unsigned i; 4032 for (i = 0; i != NumElts; ++i) { 4033 Val = SVOp->getMaskElt(i); 4034 if (Val >= 0) 4035 break; 4036 } 4037 if (Val >= (int)NumElts) 4038 Val -= NumElts - NumLaneElts; 4039 4040 assert(Val - i > 0 && "PALIGNR imm should be positive"); 4041 return (Val - i) * EltSize; 4042} 4043 4044/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 4045/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 4046/// instructions. 4047unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 4048 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4049 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 4050 4051 uint64_t Index = 4052 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4053 4054 EVT VecVT = N->getOperand(0).getValueType(); 4055 EVT ElVT = VecVT.getVectorElementType(); 4056 4057 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4058 return Index / NumElemsPerChunk; 4059} 4060 4061/// getInsertVINSERTF128Immediate - Return the appropriate immediate 4062/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 4063/// instructions. 
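/// For example, inserting a 128-bit subvector into a v8f32 at element index 4
/// gives NumElemsPerChunk = 128/32 = 4, so the returned immediate is 4/4 = 1
/// (the upper half); an index of 0 would select the lower half.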
4064unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 4065 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4066 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 4067 4068 uint64_t Index = 4069 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4070 4071 EVT VecVT = N->getValueType(0); 4072 EVT ElVT = VecVT.getVectorElementType(); 4073 4074 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4075 return Index / NumElemsPerChunk; 4076} 4077 4078/// getShuffleCLImmediate - Return the appropriate immediate to shuffle 4079/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. 4080/// Handles 256-bit. 4081static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { 4082 EVT VT = N->getValueType(0); 4083 4084 unsigned NumElts = VT.getVectorNumElements(); 4085 4086 assert((VT.is256BitVector() && NumElts == 4) && 4087 "Unsupported vector type for VPERMQ/VPERMPD"); 4088 4089 unsigned Mask = 0; 4090 for (unsigned i = 0; i != NumElts; ++i) { 4091 int Elt = N->getMaskElt(i); 4092 if (Elt < 0) 4093 continue; 4094 Mask |= Elt << (i*2); 4095 } 4096 4097 return Mask; 4098} 4099/// isZeroNode - Returns true if Elt is a constant zero or a floating point 4100/// constant +0.0. 4101bool X86::isZeroNode(SDValue Elt) { 4102 return ((isa<ConstantSDNode>(Elt) && 4103 cast<ConstantSDNode>(Elt)->isNullValue()) || 4104 (isa<ConstantFPSDNode>(Elt) && 4105 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 4106} 4107 4108/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 4109/// their permute mask. 4110static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 4111 SelectionDAG &DAG) { 4112 EVT VT = SVOp->getValueType(0); 4113 unsigned NumElems = VT.getVectorNumElements(); 4114 SmallVector<int, 8> MaskVec; 4115 4116 for (unsigned i = 0; i != NumElems; ++i) { 4117 int Idx = SVOp->getMaskElt(i); 4118 if (Idx >= 0) { 4119 if (Idx < (int)NumElems) 4120 Idx += NumElems; 4121 else 4122 Idx -= NumElems; 4123 } 4124 MaskVec.push_back(Idx); 4125 } 4126 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 4127 SVOp->getOperand(0), &MaskVec[0]); 4128} 4129 4130/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 4131/// match movhlps. The lower half elements should come from upper half of 4132/// V1 (and in order), and the upper half elements should come from the upper 4133/// half of V2 (and in order). 4134static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) { 4135 if (!VT.is128BitVector()) 4136 return false; 4137 if (VT.getVectorNumElements() != 4) 4138 return false; 4139 for (unsigned i = 0, e = 2; i != e; ++i) 4140 if (!isUndefOrEqual(Mask[i], i+2)) 4141 return false; 4142 for (unsigned i = 2; i != 4; ++i) 4143 if (!isUndefOrEqual(Mask[i], i+4)) 4144 return false; 4145 return true; 4146} 4147 4148/// isScalarLoadToVector - Returns true if the node is a scalar load that 4149/// is promoted to a vector. It also returns the LoadSDNode by reference if 4150/// required. 4151static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 4152 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 4153 return false; 4154 N = N->getOperand(0).getNode(); 4155 if (!ISD::isNON_EXTLoad(N)) 4156 return false; 4157 if (LD) 4158 *LD = cast<LoadSDNode>(N); 4159 return true; 4160} 4161 4162// Test whether the given value is a vector value which will be legalized 4163// into a load. 
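// For example, a BUILD_VECTOR of assorted floating-point constants such as
// <1.0, 2.0, 3.0, 4.0> is normally lowered to a constant-pool load, whereas
// the all-zeros and all-ones cases are filtered out below because they are
// materialized without a load.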
4164static bool WillBeConstantPoolLoad(SDNode *N) { 4165 if (N->getOpcode() != ISD::BUILD_VECTOR) 4166 return false; 4167 4168 // Check for any non-constant elements. 4169 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) 4170 switch (N->getOperand(i).getNode()->getOpcode()) { 4171 case ISD::UNDEF: 4172 case ISD::ConstantFP: 4173 case ISD::Constant: 4174 break; 4175 default: 4176 return false; 4177 } 4178 4179 // Vectors of all-zeros and all-ones are materialized with special 4180 // instructions rather than being loaded. 4181 return !ISD::isBuildVectorAllZeros(N) && 4182 !ISD::isBuildVectorAllOnes(N); 4183} 4184 4185/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 4186/// match movlp{s|d}. The lower half elements should come from lower half of 4187/// V1 (and in order), and the upper half elements should come from the upper 4188/// half of V2 (and in order). And since V1 will become the source of the 4189/// MOVLP, it must be either a vector load or a scalar load to vector. 4190static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 4191 ArrayRef<int> Mask, EVT VT) { 4192 if (!VT.is128BitVector()) 4193 return false; 4194 4195 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 4196 return false; 4197 // Is V2 is a vector load, don't do this transformation. We will try to use 4198 // load folding shufps op. 4199 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) 4200 return false; 4201 4202 unsigned NumElems = VT.getVectorNumElements(); 4203 4204 if (NumElems != 2 && NumElems != 4) 4205 return false; 4206 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 4207 if (!isUndefOrEqual(Mask[i], i)) 4208 return false; 4209 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) 4210 if (!isUndefOrEqual(Mask[i], i+NumElems)) 4211 return false; 4212 return true; 4213} 4214 4215/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 4216/// all the same. 4217static bool isSplatVector(SDNode *N) { 4218 if (N->getOpcode() != ISD::BUILD_VECTOR) 4219 return false; 4220 4221 SDValue SplatValue = N->getOperand(0); 4222 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 4223 if (N->getOperand(i) != SplatValue) 4224 return false; 4225 return true; 4226} 4227 4228/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 4229/// to an zero vector. 4230/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 4231static bool isZeroShuffle(ShuffleVectorSDNode *N) { 4232 SDValue V1 = N->getOperand(0); 4233 SDValue V2 = N->getOperand(1); 4234 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 4235 for (unsigned i = 0; i != NumElems; ++i) { 4236 int Idx = N->getMaskElt(i); 4237 if (Idx >= (int)NumElems) { 4238 unsigned Opc = V2.getOpcode(); 4239 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 4240 continue; 4241 if (Opc != ISD::BUILD_VECTOR || 4242 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 4243 return false; 4244 } else if (Idx >= 0) { 4245 unsigned Opc = V1.getOpcode(); 4246 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 4247 continue; 4248 if (Opc != ISD::BUILD_VECTOR || 4249 !X86::isZeroNode(V1.getOperand(Idx))) 4250 return false; 4251 } 4252 } 4253 return true; 4254} 4255 4256/// getZeroVector - Returns a vector of specified type with all zero elements. 
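/// For example, a request for a zero v2f64 on SSE2 is built as a v4i32
/// BUILD_VECTOR of zero constants and then bitcast to v2f64, so that all zero
/// vectors share one canonical form and CSE to a single node.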
4257///
4258static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
4259                             SelectionDAG &DAG, DebugLoc dl) {
4260  assert(VT.isVector() && "Expected a vector type");
4261  unsigned Size = VT.getSizeInBits();
4262
4263  // Always build SSE zero vectors as <4 x i32> bitcasted
4264  // to their dest type. This ensures they get CSE'd.
4265  SDValue Vec;
4266  if (Size == 128) {  // SSE
4267    if (Subtarget->hasSSE2()) {  // SSE2
4268      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4269      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4270    } else { // SSE1
4271      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4272      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
4273    }
4274  } else if (Size == 256) { // AVX
4275    if (Subtarget->hasAVX2()) { // AVX2
4276      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
4277      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4278      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4279    } else {
4280      // 256-bit logic and arithmetic instructions in AVX are all
4281      // floating-point, no support for integer ops. Emit fp zeroed vectors.
4282      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
4283      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4284      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
4285    }
4286  } else
4287    llvm_unreachable("Unexpected vector type");
4288
4289  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4290}
4291
4292/// getOnesVector - Returns a vector of specified type with all bits set.
4293/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4294/// no AVX2 support, use two <4 x i32> vectors inserted into an <8 x i32>
4295/// appropriately. Then bitcast to their original type, ensuring they get CSE'd.
4296static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
4297                             DebugLoc dl) {
4298  assert(VT.isVector() && "Expected a vector type");
4299  unsigned Size = VT.getSizeInBits();
4300
4301  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
4302  SDValue Vec;
4303  if (Size == 256) {
4304    if (HasAVX2) { // AVX2
4305      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
4306      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
4307    } else { // AVX
4308      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4309      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4310    }
4311  } else if (Size == 128) {
4312    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
4313  } else
4314    llvm_unreachable("Unexpected vector type");
4315
4316  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
4317}
4318
4319/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
4320/// that point to V2 point to its first element.
4321static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
4322  for (unsigned i = 0; i != NumElems; ++i) {
4323    if (Mask[i] > (int)NumElems) {
4324      Mask[i] = NumElems;
4325    }
4326  }
4327}
4328
4329/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
4330/// operation of specified width.
4331static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4332                       SDValue V2) {
4333  unsigned NumElems = VT.getVectorNumElements();
4334  SmallVector<int, 8> Mask;
4335  Mask.push_back(NumElems);
4336  for (unsigned i = 1; i != NumElems; ++i)
4337    Mask.push_back(i);
4338  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4339}
4340
4341/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
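/// For v4i32 this builds the mask <0, 4, 1, 5>; the matching getUnpackh below
/// builds <2, 6, 3, 7>, mirroring the PUNPCKLDQ/PUNPCKHDQ interleave patterns.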
4342static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4343                          SDValue V2) {
4344  unsigned NumElems = VT.getVectorNumElements();
4345  SmallVector<int, 8> Mask;
4346  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4347    Mask.push_back(i);
4348    Mask.push_back(i + NumElems);
4349  }
4350  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4351}
4352
4353/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
4354static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
4355                          SDValue V2) {
4356  unsigned NumElems = VT.getVectorNumElements();
4357  SmallVector<int, 8> Mask;
4358  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4359    Mask.push_back(i + Half);
4360    Mask.push_back(i + NumElems + Half);
4361  }
4362  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
4363}
4364
4365// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
4366// a generic shuffle instruction because the target has no such instructions.
4367// Generate shuffles which repeat i16 and i8 several times until they can be
4368// represented by v4f32 and then be manipulated by target supported shuffles.
4369static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
4370  EVT VT = V.getValueType();
4371  int NumElems = VT.getVectorNumElements();
4372  DebugLoc dl = V.getDebugLoc();
4373
4374  while (NumElems > 4) {
4375    if (EltNo < NumElems/2) {
4376      V = getUnpackl(DAG, dl, VT, V, V);
4377    } else {
4378      V = getUnpackh(DAG, dl, VT, V, V);
4379      EltNo -= NumElems/2;
4380    }
4381    NumElems >>= 1;
4382  }
4383  return V;
4384}
4385
4386/// getLegalSplat - Generate a legal splat with supported x86 shuffles.
4387static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
4388  EVT VT = V.getValueType();
4389  DebugLoc dl = V.getDebugLoc();
4390  unsigned Size = VT.getSizeInBits();
4391
4392  if (Size == 128) {
4393    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
4394    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
4395    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
4396                             &SplatMask[0]);
4397  } else if (Size == 256) {
4398    // To use VPERMILPS to splat scalars, the second half of indices must
4399    // refer to the higher part, which is a duplication of the lower one,
4400    // because VPERMILPS can only handle in-lane permutations.
4401    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
4402                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
4403
4404    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
4405    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
4406                             &SplatMask[0]);
4407  } else
4408    llvm_unreachable("Vector size not supported");
4409
4410  return DAG.getNode(ISD::BITCAST, dl, VT, V);
4411}
4412
4413/// PromoteSplat - Splat is promoted to target supported vector shuffles.
4414static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
4415  EVT SrcVT = SV->getValueType(0);
4416  SDValue V1 = SV->getOperand(0);
4417  DebugLoc dl = SV->getDebugLoc();
4418
4419  int EltNo = SV->getSplatIndex();
4420  int NumElems = SrcVT.getVectorNumElements();
4421  unsigned Size = SrcVT.getSizeInBits();
4422
4423  assert(((Size == 128 && NumElems > 4) || Size == 256) &&
4424         "Unknown how to promote splat for type");
4425
4426  // Extract the 128-bit part containing the splat element and update
4427  // the splat element index when it refers to the higher register.
4428  if (Size == 256) {
4429    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
4430    if (EltNo >= NumElems/2)
4431      EltNo -= NumElems/2;
4432  }
4433
4434  // All i16 and i8 vector types can't be used directly by a generic shuffle
4435  // instruction because the target has no such instruction. Generate shuffles
4436  // which repeat i16 and i8 several times until they fit in i32, and then can
4437  // be manipulated by target supported shuffles.
4438  EVT EltVT = SrcVT.getVectorElementType();
4439  if (EltVT == MVT::i8 || EltVT == MVT::i16)
4440    V1 = PromoteSplati8i16(V1, DAG, EltNo);
4441
4442  // Recreate the 256-bit vector and place the same 128-bit vector
4443  // into the low and high part. This is necessary because we want
4444  // to use VPERM* to shuffle the vectors.
4445  if (Size == 256) {
4446    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
4447  }
4448
4449  return getLegalSplat(DAG, V1, EltNo);
4450}
4451
4452/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
4453/// vector with a zero or undef vector. This produces a shuffle where the low
4454/// element of V2 is swizzled into the zero/undef vector, landing at element
4455/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4456static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
4457                                           bool IsZero,
4458                                           const X86Subtarget *Subtarget,
4459                                           SelectionDAG &DAG) {
4460  EVT VT = V2.getValueType();
4461  SDValue V1 = IsZero
4462    ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
4463  unsigned NumElems = VT.getVectorNumElements();
4464  SmallVector<int, 16> MaskVec;
4465  for (unsigned i = 0; i != NumElems; ++i)
4466    // If this is the insertion idx, put the low elt of V2 here.
4467    MaskVec.push_back(i == Idx ? NumElems : i);
4468  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
4469}
4470
4471/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
4472/// target specific opcode. Returns true if the Mask could be calculated.
4473/// Sets IsUnary to true if the node uses only one source.
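/// For example, X86ISD::UNPCKL on v4i32 decodes to <0, 4, 1, 5>, and
/// X86ISD::PSHUFD with immediate 0x1B decodes to <3, 2, 1, 0> with IsUnary set,
/// since PSHUFD reads only one source.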
4474static bool getTargetShuffleMask(SDNode *N, MVT VT, 4475 SmallVectorImpl<int> &Mask, bool &IsUnary) { 4476 unsigned NumElems = VT.getVectorNumElements(); 4477 SDValue ImmN; 4478 4479 IsUnary = false; 4480 switch(N->getOpcode()) { 4481 case X86ISD::SHUFP: 4482 ImmN = N->getOperand(N->getNumOperands()-1); 4483 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4484 break; 4485 case X86ISD::UNPCKH: 4486 DecodeUNPCKHMask(VT, Mask); 4487 break; 4488 case X86ISD::UNPCKL: 4489 DecodeUNPCKLMask(VT, Mask); 4490 break; 4491 case X86ISD::MOVHLPS: 4492 DecodeMOVHLPSMask(NumElems, Mask); 4493 break; 4494 case X86ISD::MOVLHPS: 4495 DecodeMOVLHPSMask(NumElems, Mask); 4496 break; 4497 case X86ISD::PSHUFD: 4498 case X86ISD::VPERMILP: 4499 ImmN = N->getOperand(N->getNumOperands()-1); 4500 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4501 IsUnary = true; 4502 break; 4503 case X86ISD::PSHUFHW: 4504 ImmN = N->getOperand(N->getNumOperands()-1); 4505 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4506 IsUnary = true; 4507 break; 4508 case X86ISD::PSHUFLW: 4509 ImmN = N->getOperand(N->getNumOperands()-1); 4510 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4511 IsUnary = true; 4512 break; 4513 case X86ISD::VPERMI: 4514 ImmN = N->getOperand(N->getNumOperands()-1); 4515 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4516 IsUnary = true; 4517 break; 4518 case X86ISD::MOVSS: 4519 case X86ISD::MOVSD: { 4520 // The index 0 always comes from the first element of the second source, 4521 // this is why MOVSS and MOVSD are used in the first place. The other 4522 // elements come from the other positions of the first source vector 4523 Mask.push_back(NumElems); 4524 for (unsigned i = 1; i != NumElems; ++i) { 4525 Mask.push_back(i); 4526 } 4527 break; 4528 } 4529 case X86ISD::VPERM2X128: 4530 ImmN = N->getOperand(N->getNumOperands()-1); 4531 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 4532 if (Mask.empty()) return false; 4533 break; 4534 case X86ISD::MOVDDUP: 4535 case X86ISD::MOVLHPD: 4536 case X86ISD::MOVLPD: 4537 case X86ISD::MOVLPS: 4538 case X86ISD::MOVSHDUP: 4539 case X86ISD::MOVSLDUP: 4540 case X86ISD::PALIGN: 4541 // Not yet implemented 4542 return false; 4543 default: llvm_unreachable("unknown target shuffle node"); 4544 } 4545 4546 return true; 4547} 4548 4549/// getShuffleScalarElt - Returns the scalar element that will make up the ith 4550/// element of the result of the vector shuffle. 4551static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, 4552 unsigned Depth) { 4553 if (Depth == 6) 4554 return SDValue(); // Limit search depth. 4555 4556 SDValue V = SDValue(N, 0); 4557 EVT VT = V.getValueType(); 4558 unsigned Opcode = V.getOpcode(); 4559 4560 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 4561 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 4562 int Elt = SV->getMaskElt(Index); 4563 4564 if (Elt < 0) 4565 return DAG.getUNDEF(VT.getVectorElementType()); 4566 4567 unsigned NumElems = VT.getVectorNumElements(); 4568 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) 4569 : SV->getOperand(1); 4570 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); 4571 } 4572 4573 // Recurse into target specific vector shuffles to find scalars. 
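// --- Editor's sketch (not part of the original file) ------------------------
// How an immediate-controlled shuffle is turned back into a mask, using the
// 4-element PSHUFD encoding as the example: result element i is selected by
// bits [2*i+1 : 2*i] of the 8-bit immediate. The decoders called from
// getTargetShuffleMask above (DecodePSHUFMask and friends) follow the same
// scheme, generalized to wider and per-lane types. Hypothetical helper name.
static void modelDecodePSHUFDImm(unsigned Imm, int Mask[4]) {
  for (unsigned i = 0; i != 4; ++i)
    Mask[i] = (Imm >> (2 * i)) & 0x3;
  // e.g. Imm = 0x1B (0b00011011) decodes to {3,2,1,0}, a full reversal.
}
// --- end of editor's sketch --------------------------------------------------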
4574 if (isTargetShuffle(Opcode)) { 4575 MVT ShufVT = V.getValueType().getSimpleVT(); 4576 unsigned NumElems = ShufVT.getVectorNumElements(); 4577 SmallVector<int, 16> ShuffleMask; 4578 SDValue ImmN; 4579 bool IsUnary; 4580 4581 if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) 4582 return SDValue(); 4583 4584 int Elt = ShuffleMask[Index]; 4585 if (Elt < 0) 4586 return DAG.getUNDEF(ShufVT.getVectorElementType()); 4587 4588 SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) 4589 : N->getOperand(1); 4590 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, 4591 Depth+1); 4592 } 4593 4594 // Actual nodes that may contain scalar elements 4595 if (Opcode == ISD::BITCAST) { 4596 V = V.getOperand(0); 4597 EVT SrcVT = V.getValueType(); 4598 unsigned NumElems = VT.getVectorNumElements(); 4599 4600 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 4601 return SDValue(); 4602 } 4603 4604 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 4605 return (Index == 0) ? V.getOperand(0) 4606 : DAG.getUNDEF(VT.getVectorElementType()); 4607 4608 if (V.getOpcode() == ISD::BUILD_VECTOR) 4609 return V.getOperand(Index); 4610 4611 return SDValue(); 4612} 4613 4614/// getNumOfConsecutiveZeros - Return the number of elements of a vector 4615/// shuffle operation which come from a consecutively from a zero. The 4616/// search can start in two different directions, from left or right. 4617static 4618unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems, 4619 bool ZerosFromLeft, SelectionDAG &DAG) { 4620 unsigned i; 4621 for (i = 0; i != NumElems; ++i) { 4622 unsigned Index = ZerosFromLeft ? i : NumElems-i-1; 4623 SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); 4624 if (!(Elt.getNode() && 4625 (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) 4626 break; 4627 } 4628 4629 return i; 4630} 4631 4632/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) 4633/// correspond consecutively to elements from one of the vector operands, 4634/// starting from its index OpIdx. Also tell OpNum which source vector operand. 4635static 4636bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, 4637 unsigned MaskI, unsigned MaskE, unsigned OpIdx, 4638 unsigned NumElems, unsigned &OpNum) { 4639 bool SeenV1 = false; 4640 bool SeenV2 = false; 4641 4642 for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) { 4643 int Idx = SVOp->getMaskElt(i); 4644 // Ignore undef indicies 4645 if (Idx < 0) 4646 continue; 4647 4648 if (Idx < (int)NumElems) 4649 SeenV1 = true; 4650 else 4651 SeenV2 = true; 4652 4653 // Only accept consecutive elements from the same vector 4654 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) 4655 return false; 4656 } 4657 4658 OpNum = SeenV1 ? 0 : 1; 4659 return true; 4660} 4661 4662/// isVectorShiftRight - Returns true if the shuffle can be implemented as a 4663/// logical left shift of a vector. 4664static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4665 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4666 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4667 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4668 false /* check zeros from right */, DAG); 4669 unsigned OpSrc; 4670 4671 if (!NumZeros) 4672 return false; 4673 4674 // Considering the elements in the mask that are not consecutive zeros, 4675 // check if they consecutively come from only one of the source vectors. 
4676 // 4677 // V1 = {X, A, B, C} 0 4678 // \ \ \ / 4679 // vector_shuffle V1, V2 <1, 2, 3, X> 4680 // 4681 if (!isShuffleMaskConsecutive(SVOp, 4682 0, // Mask Start Index 4683 NumElems-NumZeros, // Mask End Index(exclusive) 4684 NumZeros, // Where to start looking in the src vector 4685 NumElems, // Number of elements in vector 4686 OpSrc)) // Which source operand ? 4687 return false; 4688 4689 isLeft = false; 4690 ShAmt = NumZeros; 4691 ShVal = SVOp->getOperand(OpSrc); 4692 return true; 4693} 4694 4695/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 4696/// logical left shift of a vector. 4697static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4698 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4699 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4700 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4701 true /* check zeros from left */, DAG); 4702 unsigned OpSrc; 4703 4704 if (!NumZeros) 4705 return false; 4706 4707 // Considering the elements in the mask that are not consecutive zeros, 4708 // check if they consecutively come from only one of the source vectors. 4709 // 4710 // 0 { A, B, X, X } = V2 4711 // / \ / / 4712 // vector_shuffle V1, V2 <X, X, 4, 5> 4713 // 4714 if (!isShuffleMaskConsecutive(SVOp, 4715 NumZeros, // Mask Start Index 4716 NumElems, // Mask End Index(exclusive) 4717 0, // Where to start looking in the src vector 4718 NumElems, // Number of elements in vector 4719 OpSrc)) // Which source operand ? 4720 return false; 4721 4722 isLeft = true; 4723 ShAmt = NumZeros; 4724 ShVal = SVOp->getOperand(OpSrc); 4725 return true; 4726} 4727 4728/// isVectorShift - Returns true if the shuffle can be implemented as a 4729/// logical left or right shift of a vector. 4730static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4731 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4732 // Although the logic below support any bitwidth size, there are no 4733 // shift instructions which handle more than 128-bit vectors. 4734 if (!SVOp->getValueType(0).is128BitVector()) 4735 return false; 4736 4737 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4738 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4739 return true; 4740 4741 return false; 4742} 4743 4744/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
4745/// 4746static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4747 unsigned NumNonZero, unsigned NumZero, 4748 SelectionDAG &DAG, 4749 const X86Subtarget* Subtarget, 4750 const TargetLowering &TLI) { 4751 if (NumNonZero > 8) 4752 return SDValue(); 4753 4754 DebugLoc dl = Op.getDebugLoc(); 4755 SDValue V(0, 0); 4756 bool First = true; 4757 for (unsigned i = 0; i < 16; ++i) { 4758 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4759 if (ThisIsNonZero && First) { 4760 if (NumZero) 4761 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 4762 else 4763 V = DAG.getUNDEF(MVT::v8i16); 4764 First = false; 4765 } 4766 4767 if ((i & 1) != 0) { 4768 SDValue ThisElt(0, 0), LastElt(0, 0); 4769 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4770 if (LastIsNonZero) { 4771 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4772 MVT::i16, Op.getOperand(i-1)); 4773 } 4774 if (ThisIsNonZero) { 4775 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4776 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4777 ThisElt, DAG.getConstant(8, MVT::i8)); 4778 if (LastIsNonZero) 4779 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4780 } else 4781 ThisElt = LastElt; 4782 4783 if (ThisElt.getNode()) 4784 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4785 DAG.getIntPtrConstant(i/2)); 4786 } 4787 } 4788 4789 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4790} 4791 4792/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 4793/// 4794static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4795 unsigned NumNonZero, unsigned NumZero, 4796 SelectionDAG &DAG, 4797 const X86Subtarget* Subtarget, 4798 const TargetLowering &TLI) { 4799 if (NumNonZero > 4) 4800 return SDValue(); 4801 4802 DebugLoc dl = Op.getDebugLoc(); 4803 SDValue V(0, 0); 4804 bool First = true; 4805 for (unsigned i = 0; i < 8; ++i) { 4806 bool isNonZero = (NonZeros & (1 << i)) != 0; 4807 if (isNonZero) { 4808 if (First) { 4809 if (NumZero) 4810 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 4811 else 4812 V = DAG.getUNDEF(MVT::v8i16); 4813 First = false; 4814 } 4815 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4816 MVT::v8i16, V, Op.getOperand(i), 4817 DAG.getIntPtrConstant(i)); 4818 } 4819 } 4820 4821 return V; 4822} 4823 4824/// getVShift - Return a vector logical shift node. 4825/// 4826static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4827 unsigned NumBits, SelectionDAG &DAG, 4828 const TargetLowering &TLI, DebugLoc dl) { 4829 assert(VT.is128BitVector() && "Unknown type for VShift"); 4830 EVT ShVT = MVT::v2i64; 4831 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; 4832 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4833 return DAG.getNode(ISD::BITCAST, dl, VT, 4834 DAG.getNode(Opc, dl, ShVT, SrcOp, 4835 DAG.getConstant(NumBits, 4836 TLI.getShiftAmountTy(SrcOp.getValueType())))); 4837} 4838 4839SDValue 4840X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4841 SelectionDAG &DAG) const { 4842 4843 // Check if the scalar load can be widened into a vector load. And if 4844 // the address is "base + cst" see if the cst can be "absorbed" into 4845 // the shuffle mask. 
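// --- Editor's sketch (not part of the original file) ------------------------
// The "absorb the constant offset into the shuffle mask" arithmetic applied
// just below, on plain integers. RequiredAlign is the vector width in bytes
// (16 or 32) and the loaded scalar is 4 bytes wide. Returns the element index
// to splat, or -1 if the offset cannot be absorbed. Hypothetical helper; the
// real code additionally checks frame-index alignment.
static int modelAbsorbOffset(long long Offset, unsigned RequiredAlign,
                             long long &StartOffset) {
  if (Offset < 0 || ((Offset % RequiredAlign) & 3))
    return -1;                               // not 4-byte aligned in the chunk
  StartOffset = Offset & ~(long long)(RequiredAlign - 1);
  return (int)((Offset - StartOffset) >> 2); // which 32-bit element to splat
}
// e.g. Offset=20, RequiredAlign=16: StartOffset=16, element index 1, so a full
// v4i32/v4f32 is loaded at base+16 and element 1 of it is splatted.
// --- end of editor's sketch --------------------------------------------------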
4846 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4847 SDValue Ptr = LD->getBasePtr(); 4848 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4849 return SDValue(); 4850 EVT PVT = LD->getValueType(0); 4851 if (PVT != MVT::i32 && PVT != MVT::f32) 4852 return SDValue(); 4853 4854 int FI = -1; 4855 int64_t Offset = 0; 4856 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4857 FI = FINode->getIndex(); 4858 Offset = 0; 4859 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4860 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4861 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4862 Offset = Ptr.getConstantOperandVal(1); 4863 Ptr = Ptr.getOperand(0); 4864 } else { 4865 return SDValue(); 4866 } 4867 4868 // FIXME: 256-bit vector instructions don't require a strict alignment, 4869 // improve this code to support it better. 4870 unsigned RequiredAlign = VT.getSizeInBits()/8; 4871 SDValue Chain = LD->getChain(); 4872 // Make sure the stack object alignment is at least 16 or 32. 4873 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4874 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 4875 if (MFI->isFixedObjectIndex(FI)) { 4876 // Can't change the alignment. FIXME: It's possible to compute 4877 // the exact stack offset and reference FI + adjust offset instead. 4878 // If someone *really* cares about this. That's the way to implement it. 4879 return SDValue(); 4880 } else { 4881 MFI->setObjectAlignment(FI, RequiredAlign); 4882 } 4883 } 4884 4885 // (Offset % 16 or 32) must be multiple of 4. Then address is then 4886 // Ptr + (Offset & ~15). 4887 if (Offset < 0) 4888 return SDValue(); 4889 if ((Offset % RequiredAlign) & 3) 4890 return SDValue(); 4891 int64_t StartOffset = Offset & ~(RequiredAlign-1); 4892 if (StartOffset) 4893 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4894 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4895 4896 int EltNo = (Offset - StartOffset) >> 2; 4897 unsigned NumElems = VT.getVectorNumElements(); 4898 4899 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 4900 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 4901 LD->getPointerInfo().getWithOffset(StartOffset), 4902 false, false, false, 0); 4903 4904 SmallVector<int, 8> Mask; 4905 for (unsigned i = 0; i != NumElems; ++i) 4906 Mask.push_back(EltNo); 4907 4908 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); 4909 } 4910 4911 return SDValue(); 4912} 4913 4914/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4915/// vector of type 'VT', see if the elements can be replaced by a single large 4916/// load which has the same value as a build_vector whose operands are 'elts'. 4917/// 4918/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4919/// 4920/// FIXME: we'd also like to handle the case where the last elements are zero 4921/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4922/// There's even a handy isZeroNode for that purpose. 4923static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4924 DebugLoc &DL, SelectionDAG &DAG) { 4925 EVT EltVT = VT.getVectorElementType(); 4926 unsigned NumElems = Elts.size(); 4927 4928 LoadSDNode *LDBase = NULL; 4929 unsigned LastLoadedElt = -1U; 4930 4931 // For each element in the initializer, see if we've found a load or an undef. 4932 // If we don't find an initial load element, or later load elements are 4933 // non-consecutive, bail out. 
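// --- Editor's sketch (not part of the original file) ------------------------
// The consecutive-load test performed by the loop that follows, modeled on
// byte offsets only: element 0 must be a load, later elements may be undef
// (a negative offset here), and every defined element i must sit exactly
// i*EltSize bytes after element 0. Returns the index of the last defined
// element, or -1 on failure. Hypothetical helper and encoding.
static int modelConsecutiveLoads(const long long *Offsets, unsigned NumElems,
                                 unsigned EltSizeInBytes) {
  if (Offsets[0] < 0)
    return -1;                                  // base element must be a load
  int LastLoaded = 0;
  for (unsigned i = 1; i != NumElems; ++i) {
    if (Offsets[i] < 0)
      continue;                                 // undef elements are fine
    if (Offsets[i] != Offsets[0] + (long long)i * EltSizeInBytes)
      return -1;                                // not consecutive
    LastLoaded = (int)i;
  }
  // LastLoaded == NumElems-1: fold into one wide load; NumElems == 4 with
  // LastLoaded == 1: the code emits a VZEXT_LOAD of the low half instead.
  return LastLoaded;
}
// --- end of editor's sketch --------------------------------------------------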
4934 for (unsigned i = 0; i < NumElems; ++i) { 4935 SDValue Elt = Elts[i]; 4936 4937 if (!Elt.getNode() || 4938 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4939 return SDValue(); 4940 if (!LDBase) { 4941 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4942 return SDValue(); 4943 LDBase = cast<LoadSDNode>(Elt.getNode()); 4944 LastLoadedElt = i; 4945 continue; 4946 } 4947 if (Elt.getOpcode() == ISD::UNDEF) 4948 continue; 4949 4950 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4951 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4952 return SDValue(); 4953 LastLoadedElt = i; 4954 } 4955 4956 // If we have found an entire vector of loads and undefs, then return a large 4957 // load of the entire vector width starting at the base pointer. If we found 4958 // consecutive loads for the low half, generate a vzext_load node. 4959 if (LastLoadedElt == NumElems - 1) { 4960 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4961 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4962 LDBase->getPointerInfo(), 4963 LDBase->isVolatile(), LDBase->isNonTemporal(), 4964 LDBase->isInvariant(), 0); 4965 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4966 LDBase->getPointerInfo(), 4967 LDBase->isVolatile(), LDBase->isNonTemporal(), 4968 LDBase->isInvariant(), LDBase->getAlignment()); 4969 } 4970 if (NumElems == 4 && LastLoadedElt == 1 && 4971 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 4972 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4973 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4974 SDValue ResNode = 4975 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64, 4976 LDBase->getPointerInfo(), 4977 LDBase->getAlignment(), 4978 false/*isVolatile*/, true/*ReadMem*/, 4979 false/*WriteMem*/); 4980 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4981 } 4982 return SDValue(); 4983} 4984 4985/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction 4986/// to generate a splat value for the following cases: 4987/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. 4988/// 2. A splat shuffle which uses a scalar_to_vector node which comes from 4989/// a scalar load, or a constant. 4990/// The VBROADCAST node is returned when a pattern is found, 4991/// or SDValue() otherwise. 4992SDValue 4993X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { 4994 if (!Subtarget->hasAVX()) 4995 return SDValue(); 4996 4997 EVT VT = Op.getValueType(); 4998 DebugLoc dl = Op.getDebugLoc(); 4999 5000 assert((VT.is128BitVector() || VT.is256BitVector()) && 5001 "Unsupported vector type for broadcast."); 5002 5003 SDValue Ld; 5004 bool ConstSplatVal; 5005 5006 switch (Op.getOpcode()) { 5007 default: 5008 // Unknown pattern found. 5009 return SDValue(); 5010 5011 case ISD::BUILD_VECTOR: { 5012 // The BUILD_VECTOR node must be a splat. 5013 if (!isSplatVector(Op.getNode())) 5014 return SDValue(); 5015 5016 Ld = Op.getOperand(0); 5017 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5018 Ld.getOpcode() == ISD::ConstantFP); 5019 5020 // The suspected load node has several users. Make sure that all 5021 // of its users are from the BUILD_VECTOR node. 5022 // Constants may have multiple users. 
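// --- Editor's sketch (not part of the original file) ------------------------
// A condensed model of the size/feature gating that the remainder of
// LowerVectorBroadcast applies below, once a splat source has been found. It
// assumes AVX is available (the function bails out earlier otherwise).
// Hypothetical helper.
static bool modelBroadcastSupported(bool HasAVX2, bool Is256, bool IsLoad,
                                    bool IsConstSplat, bool IsInteger,
                                    unsigned ScalarBits) {
  // vbroadcastss handles 32-bit scalars; vbroadcastsd exists only for ymm.
  bool NaturalSize = ScalarBits == 32 || (Is256 && ScalarBits == 64);
  if (IsConstSplat && HasAVX2 && NaturalSize)
    return true;               // broadcast the constant from the constant pool
  if (!IsLoad)
    return HasAVX2 && NaturalSize;            // AVX2 register-form broadcast
  if (NaturalSize)
    return true;                              // vbroadcastss/sd from memory
  // AVX2 adds integer broadcasts (vpbroadcastb/w/q) for the remaining sizes.
  return HasAVX2 && IsInteger &&
         (ScalarBits == 8 || ScalarBits == 16 || ScalarBits == 64);
}
// --- end of editor's sketch --------------------------------------------------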
5023 if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) 5024 return SDValue(); 5025 break; 5026 } 5027 5028 case ISD::VECTOR_SHUFFLE: { 5029 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5030 5031 // Shuffles must have a splat mask where the first element is 5032 // broadcasted. 5033 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 5034 return SDValue(); 5035 5036 SDValue Sc = Op.getOperand(0); 5037 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && 5038 Sc.getOpcode() != ISD::BUILD_VECTOR) { 5039 5040 if (!Subtarget->hasAVX2()) 5041 return SDValue(); 5042 5043 // Use the register form of the broadcast instruction available on AVX2. 5044 if (VT.is256BitVector()) 5045 Sc = Extract128BitVector(Sc, 0, DAG, dl); 5046 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); 5047 } 5048 5049 Ld = Sc.getOperand(0); 5050 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 5051 Ld.getOpcode() == ISD::ConstantFP); 5052 5053 // The scalar_to_vector node and the suspected 5054 // load node must have exactly one user. 5055 // Constants may have multiple users. 5056 if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse())) 5057 return SDValue(); 5058 break; 5059 } 5060 } 5061 5062 bool Is256 = VT.is256BitVector(); 5063 5064 // Handle the broadcasting a single constant scalar from the constant pool 5065 // into a vector. On Sandybridge it is still better to load a constant vector 5066 // from the constant pool and not to broadcast it from a scalar. 5067 if (ConstSplatVal && Subtarget->hasAVX2()) { 5068 EVT CVT = Ld.getValueType(); 5069 assert(!CVT.isVector() && "Must not broadcast a vector type"); 5070 unsigned ScalarSize = CVT.getSizeInBits(); 5071 5072 if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) { 5073 const Constant *C = 0; 5074 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) 5075 C = CI->getConstantIntValue(); 5076 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) 5077 C = CF->getConstantFPValue(); 5078 5079 assert(C && "Invalid constant type"); 5080 5081 SDValue CP = DAG.getConstantPool(C, getPointerTy()); 5082 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 5083 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, 5084 MachinePointerInfo::getConstantPool(), 5085 false, false, false, Alignment); 5086 5087 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5088 } 5089 } 5090 5091 bool IsLoad = ISD::isNormalLoad(Ld.getNode()); 5092 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 5093 5094 // Handle AVX2 in-register broadcasts. 5095 if (!IsLoad && Subtarget->hasAVX2() && 5096 (ScalarSize == 32 || (Is256 && ScalarSize == 64))) 5097 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5098 5099 // The scalar source must be a normal load. 5100 if (!IsLoad) 5101 return SDValue(); 5102 5103 if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) 5104 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5105 5106 // The integer check is needed for the 64-bit into 128-bit so it doesn't match 5107 // double since there is no vbroadcastsd xmm 5108 if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { 5109 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) 5110 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 5111 } 5112 5113 // Unsupported broadcast. 5114 return SDValue(); 5115} 5116 5117// LowerVectorFpExtend - Recognize the scalarized FP_EXTEND from v2f32 to v2f64 5118// and convert it into X86ISD::VFPEXT due to the current ISD::FP_EXTEND has the 5119// constraint of matching input/output vector elements. 
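// --- Editor's sketch (not part of the original file) ------------------------
// The pattern scan performed by LowerVectorFpExtend below, modeled without DAG
// nodes. Each BUILD_VECTOR operand is described by (SrcId[i], ExtractIdx[i]),
// with SrcId[i] < 0 meaning undef; every defined operand must be an fpext of
// an extract_vector_elt (constant index) from one and the same source vector.
// On success the shuffle mask, padded with -1 up to the source width, is
// produced. Hypothetical helper and encoding.
static bool modelMatchVFpExt(const int *SrcId, const int *ExtractIdx,
                             unsigned NumElts, unsigned SrcNumElts, int *Mask) {
  int CommonSrc = -1;
  for (unsigned i = 0; i != NumElts; ++i) {
    if (SrcId[i] < 0) { Mask[i] = -1; continue; }     // undef element
    if (CommonSrc < 0)
      CommonSrc = SrcId[i];
    else if (SrcId[i] != CommonSrc)
      return false;                                   // built from >1 vector
    Mask[i] = ExtractIdx[i];
  }
  if (CommonSrc < 0)
    return false;                                     // all operands undef
  for (unsigned i = NumElts; i != SrcNumElts; ++i)
    Mask[i] = -1;                                     // pad to source width
  return true;    // the real code then emits X86ISD::VFPEXT of that shuffle
}
// --- end of editor's sketch --------------------------------------------------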
5120SDValue 5121X86TargetLowering::LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const { 5122 DebugLoc DL = Op.getDebugLoc(); 5123 SDNode *N = Op.getNode(); 5124 EVT VT = Op.getValueType(); 5125 unsigned NumElts = Op.getNumOperands(); 5126 5127 // Check supported types and sub-targets. 5128 // 5129 // Only v2f32 -> v2f64 needs special handling. 5130 if (VT != MVT::v2f64 || !Subtarget->hasSSE2()) 5131 return SDValue(); 5132 5133 SDValue VecIn; 5134 EVT VecInVT; 5135 SmallVector<int, 8> Mask; 5136 EVT SrcVT = MVT::Other; 5137 5138 // Check the patterns could be translated into X86vfpext. 5139 for (unsigned i = 0; i < NumElts; ++i) { 5140 SDValue In = N->getOperand(i); 5141 unsigned Opcode = In.getOpcode(); 5142 5143 // Skip if the element is undefined. 5144 if (Opcode == ISD::UNDEF) { 5145 Mask.push_back(-1); 5146 continue; 5147 } 5148 5149 // Quit if one of the elements is not defined from 'fpext'. 5150 if (Opcode != ISD::FP_EXTEND) 5151 return SDValue(); 5152 5153 // Check how the source of 'fpext' is defined. 5154 SDValue L2In = In.getOperand(0); 5155 EVT L2InVT = L2In.getValueType(); 5156 5157 // Check the original type 5158 if (SrcVT == MVT::Other) 5159 SrcVT = L2InVT; 5160 else if (SrcVT != L2InVT) // Quit if non-homogenous typed. 5161 return SDValue(); 5162 5163 // Check whether the value being 'fpext'ed is extracted from the same 5164 // source. 5165 Opcode = L2In.getOpcode(); 5166 5167 // Quit if it's not extracted with a constant index. 5168 if (Opcode != ISD::EXTRACT_VECTOR_ELT || 5169 !isa<ConstantSDNode>(L2In.getOperand(1))) 5170 return SDValue(); 5171 5172 SDValue ExtractedFromVec = L2In.getOperand(0); 5173 5174 if (VecIn.getNode() == 0) { 5175 VecIn = ExtractedFromVec; 5176 VecInVT = ExtractedFromVec.getValueType(); 5177 } else if (VecIn != ExtractedFromVec) // Quit if built from more than 1 vec. 5178 return SDValue(); 5179 5180 Mask.push_back(cast<ConstantSDNode>(L2In.getOperand(1))->getZExtValue()); 5181 } 5182 5183 // Quit if all operands of BUILD_VECTOR are undefined. 5184 if (!VecIn.getNode()) 5185 return SDValue(); 5186 5187 // Fill the remaining mask as undef. 5188 for (unsigned i = NumElts; i < VecInVT.getVectorNumElements(); ++i) 5189 Mask.push_back(-1); 5190 5191 return DAG.getNode(X86ISD::VFPEXT, DL, VT, 5192 DAG.getVectorShuffle(VecInVT, DL, 5193 VecIn, DAG.getUNDEF(VecInVT), 5194 &Mask[0])); 5195} 5196 5197SDValue 5198X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5199 DebugLoc dl = Op.getDebugLoc(); 5200 5201 EVT VT = Op.getValueType(); 5202 EVT ExtVT = VT.getVectorElementType(); 5203 unsigned NumElems = Op.getNumOperands(); 5204 5205 // Vectors containing all zeros can be matched by pxor and xorps later 5206 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 5207 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 5208 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 5209 if (VT == MVT::v4i32 || VT == MVT::v8i32) 5210 return Op; 5211 5212 return getZeroVector(VT, Subtarget, DAG, dl); 5213 } 5214 5215 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 5216 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use 5217 // vpcmpeqd on 256-bit vectors. 
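// --- Editor's sketch (not part of the original file) ------------------------
// The per-element scan performed a little further down in LowerBUILD_VECTOR,
// modeled on a plain description of the operands: Kind[i] is 0 for undef, 1
// for a zero element, 2 for anything else. NonZeros collects one bit per
// non-zero element; when exactly one bit is set, its position is the Idx used
// by the single-element paths. Hypothetical helper.
static void modelScanBuildVector(const int *Kind, unsigned NumElems,
                                 unsigned &NonZeros, unsigned &NumNonZero,
                                 unsigned &NumZero, int &SingleIdx) {
  NonZeros = NumNonZero = NumZero = 0;
  SingleIdx = -1;
  for (unsigned i = 0; i != NumElems; ++i) {
    if (Kind[i] == 0) continue;                // undef contributes nothing
    if (Kind[i] == 1) { ++NumZero; continue; }
    NonZeros |= (1u << i);
    ++NumNonZero;
  }
  if (NumNonZero == 1)
    for (unsigned i = 0; i != NumElems; ++i)
      if (NonZeros & (1u << i)) { SingleIdx = (int)i; break; }
  // e.g. Kind = {1,0,2,1} gives NonZeros = 0b0100, NumNonZero = 1, NumZero = 2
  // and SingleIdx = 2.
}
// --- end of editor's sketch --------------------------------------------------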
5218 if (ISD::isBuildVectorAllOnes(Op.getNode())) { 5219 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2())) 5220 return Op; 5221 5222 return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl); 5223 } 5224 5225 SDValue Broadcast = LowerVectorBroadcast(Op, DAG); 5226 if (Broadcast.getNode()) 5227 return Broadcast; 5228 5229 SDValue FpExt = LowerVectorFpExtend(Op, DAG); 5230 if (FpExt.getNode()) 5231 return FpExt; 5232 5233 unsigned EVTBits = ExtVT.getSizeInBits(); 5234 5235 unsigned NumZero = 0; 5236 unsigned NumNonZero = 0; 5237 unsigned NonZeros = 0; 5238 bool IsAllConstants = true; 5239 SmallSet<SDValue, 8> Values; 5240 for (unsigned i = 0; i < NumElems; ++i) { 5241 SDValue Elt = Op.getOperand(i); 5242 if (Elt.getOpcode() == ISD::UNDEF) 5243 continue; 5244 Values.insert(Elt); 5245 if (Elt.getOpcode() != ISD::Constant && 5246 Elt.getOpcode() != ISD::ConstantFP) 5247 IsAllConstants = false; 5248 if (X86::isZeroNode(Elt)) 5249 NumZero++; 5250 else { 5251 NonZeros |= (1 << i); 5252 NumNonZero++; 5253 } 5254 } 5255 5256 // All undef vector. Return an UNDEF. All zero vectors were handled above. 5257 if (NumNonZero == 0) 5258 return DAG.getUNDEF(VT); 5259 5260 // Special case for single non-zero, non-undef, element. 5261 if (NumNonZero == 1) { 5262 unsigned Idx = CountTrailingZeros_32(NonZeros); 5263 SDValue Item = Op.getOperand(Idx); 5264 5265 // If this is an insertion of an i64 value on x86-32, and if the top bits of 5266 // the value are obviously zero, truncate the value to i32 and do the 5267 // insertion that way. Only do this if the value is non-constant or if the 5268 // value is a constant being inserted into element 0. It is cheaper to do 5269 // a constant pool load than it is to do a movd + shuffle. 5270 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 5271 (!IsAllConstants || Idx == 0)) { 5272 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 5273 // Handle SSE only. 5274 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 5275 EVT VecVT = MVT::v4i32; 5276 unsigned VecElts = 4; 5277 5278 // Truncate the value (which may itself be a constant) to i32, and 5279 // convert it to a vector with movd (S2V+shuffle to zero extend). 5280 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 5281 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 5282 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5283 5284 // Now we have our 32-bit value zero extended in the low element of 5285 // a vector. If Idx != 0, swizzle it into place. 5286 if (Idx != 0) { 5287 SmallVector<int, 4> Mask; 5288 Mask.push_back(Idx); 5289 for (unsigned i = 1; i != VecElts; ++i) 5290 Mask.push_back(i); 5291 Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), 5292 &Mask[0]); 5293 } 5294 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5295 } 5296 } 5297 5298 // If we have a constant or non-constant insertion into the low element of 5299 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 5300 // the rest of the elements. This will be matched as movd/movq/movss/movsd 5301 // depending on what the source datatype is. 
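// --- Editor's sketch (not part of the original file) ------------------------
// The placement mask used at the end of this single-non-zero path (the
// EVTBits == 32 case further below): once the scalar sits in lane 0 of an
// otherwise zero (or undef) vector, a one-input shuffle moves it to lane Idx
// by reading lane 0 there and lane 1 everywhere else. Hypothetical helper.
static void modelPlaceScalarMask(unsigned NumElems, unsigned Idx, int *Mask) {
  for (unsigned i = 0; i != NumElems; ++i)
    Mask[i] = (i == Idx) ? 0 : 1;
  // e.g. NumElems=4, Idx=2 gives {1,1,0,1}: lane 2 receives the scalar and the
  // remaining lanes copy lane 1, which is zero when NumZero > 0.
}
// --- end of editor's sketch --------------------------------------------------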
5302 if (Idx == 0) { 5303 if (NumZero == 0) 5304 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5305 5306 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 5307 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 5308 if (VT.is256BitVector()) { 5309 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); 5310 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, 5311 Item, DAG.getIntPtrConstant(0)); 5312 } 5313 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5314 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5315 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 5316 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5317 } 5318 5319 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 5320 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 5321 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 5322 if (VT.is256BitVector()) { 5323 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); 5324 Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); 5325 } else { 5326 assert(VT.is128BitVector() && "Expected an SSE value type!"); 5327 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5328 } 5329 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5330 } 5331 } 5332 5333 // Is it a vector logical left shift? 5334 if (NumElems == 2 && Idx == 1 && 5335 X86::isZeroNode(Op.getOperand(0)) && 5336 !X86::isZeroNode(Op.getOperand(1))) { 5337 unsigned NumBits = VT.getSizeInBits(); 5338 return getVShift(true, VT, 5339 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5340 VT, Op.getOperand(1)), 5341 NumBits/2, DAG, *this, dl); 5342 } 5343 5344 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 5345 return SDValue(); 5346 5347 // Otherwise, if this is a vector with i32 or f32 elements, and the element 5348 // is a non-constant being inserted into an element other than the low one, 5349 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 5350 // movd/movss) to move this into the low element, then shuffle it into 5351 // place. 5352 if (EVTBits == 32) { 5353 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5354 5355 // Turn it into a shuffle of zero and zero-extended scalar to vector. 5356 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); 5357 SmallVector<int, 8> MaskVec; 5358 for (unsigned i = 0; i != NumElems; ++i) 5359 MaskVec.push_back(i == Idx ? 0 : 1); 5360 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 5361 } 5362 } 5363 5364 // Splat is obviously ok. Let legalizer expand it to a shuffle. 5365 if (Values.size() == 1) { 5366 if (EVTBits == 32) { 5367 // Instead of a shuffle like this: 5368 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 5369 // Check if it's possible to issue this instead. 5370 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 5371 unsigned Idx = CountTrailingZeros_32(NonZeros); 5372 SDValue Item = Op.getOperand(Idx); 5373 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 5374 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 5375 } 5376 return SDValue(); 5377 } 5378 5379 // A vector full of immediates; various special cases are already 5380 // handled, so this is best done with a single constant-pool load. 5381 if (IsAllConstants) 5382 return SDValue(); 5383 5384 // For AVX-length vectors, build the individual 128-bit pieces and use 5385 // shuffles to put them in place. 
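// --- Editor's sketch (not part of the original file) ------------------------
// The 256-bit split performed just below, in terms of operand positions: the
// first NumElems/2 operands form the lower 128-bit BUILD_VECTOR, the rest the
// upper one, and Concat128BitVectors recombines them with two VINSERTF128s.
// Modeled on an array of operand labels; hypothetical helper.
static void modelSplitBuildVector(const int *Ops, unsigned NumElems,
                                  int *LowerHalf, int *UpperHalf) {
  for (unsigned i = 0; i != NumElems / 2; ++i) {
    LowerHalf[i] = Ops[i];                   // operands 0 .. NumElems/2-1
    UpperHalf[i] = Ops[i + NumElems / 2];    // operands NumElems/2 .. NumElems-1
  }
  // e.g. a v8f32 build_vector <a,b,c,d,e,f,g,h> becomes two v4f32
  // build_vectors, <a,b,c,d> and <e,f,g,h>, inserted into one 256-bit vector.
}
// --- end of editor's sketch --------------------------------------------------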
5386 if (VT.is256BitVector()) { 5387 SmallVector<SDValue, 32> V; 5388 for (unsigned i = 0; i != NumElems; ++i) 5389 V.push_back(Op.getOperand(i)); 5390 5391 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 5392 5393 // Build both the lower and upper subvector. 5394 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 5395 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 5396 NumElems/2); 5397 5398 // Recreate the wider vector with the lower and upper part. 5399 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 5400 } 5401 5402 // Let legalizer expand 2-wide build_vectors. 5403 if (EVTBits == 64) { 5404 if (NumNonZero == 1) { 5405 // One half is zero or undef. 5406 unsigned Idx = CountTrailingZeros_32(NonZeros); 5407 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 5408 Op.getOperand(Idx)); 5409 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); 5410 } 5411 return SDValue(); 5412 } 5413 5414 // If element VT is < 32 bits, convert it to inserts into a zero vector. 5415 if (EVTBits == 8 && NumElems == 16) { 5416 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 5417 Subtarget, *this); 5418 if (V.getNode()) return V; 5419 } 5420 5421 if (EVTBits == 16 && NumElems == 8) { 5422 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 5423 Subtarget, *this); 5424 if (V.getNode()) return V; 5425 } 5426 5427 // If element VT is == 32 bits, turn it into a number of shuffles. 5428 SmallVector<SDValue, 8> V(NumElems); 5429 if (NumElems == 4 && NumZero > 0) { 5430 for (unsigned i = 0; i < 4; ++i) { 5431 bool isZero = !(NonZeros & (1 << i)); 5432 if (isZero) 5433 V[i] = getZeroVector(VT, Subtarget, DAG, dl); 5434 else 5435 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5436 } 5437 5438 for (unsigned i = 0; i < 2; ++i) { 5439 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 5440 default: break; 5441 case 0: 5442 V[i] = V[i*2]; // Must be a zero vector. 5443 break; 5444 case 1: 5445 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 5446 break; 5447 case 2: 5448 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 5449 break; 5450 case 3: 5451 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 5452 break; 5453 } 5454 } 5455 5456 bool Reverse1 = (NonZeros & 0x3) == 2; 5457 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; 5458 int MaskVec[] = { 5459 Reverse1 ? 1 : 0, 5460 Reverse1 ? 0 : 1, 5461 static_cast<int>(Reverse2 ? NumElems+1 : NumElems), 5462 static_cast<int>(Reverse2 ? NumElems : NumElems+1) 5463 }; 5464 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 5465 } 5466 5467 if (Values.size() > 1 && VT.is128BitVector()) { 5468 // Check for a build vector of consecutive loads. 5469 for (unsigned i = 0; i < NumElems; ++i) 5470 V[i] = Op.getOperand(i); 5471 5472 // Check for elements which are consecutive loads. 5473 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 5474 if (LD.getNode()) 5475 return LD; 5476 5477 // For SSE 4.1, use insertps to put the high elements into the low element. 
5478 if (getSubtarget()->hasSSE41()) { 5479 SDValue Result; 5480 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 5481 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 5482 else 5483 Result = DAG.getUNDEF(VT); 5484 5485 for (unsigned i = 1; i < NumElems; ++i) { 5486 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 5487 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 5488 Op.getOperand(i), DAG.getIntPtrConstant(i)); 5489 } 5490 return Result; 5491 } 5492 5493 // Otherwise, expand into a number of unpckl*, start by extending each of 5494 // our (non-undef) elements to the full vector width with the element in the 5495 // bottom slot of the vector (which generates no code for SSE). 5496 for (unsigned i = 0; i < NumElems; ++i) { 5497 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 5498 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5499 else 5500 V[i] = DAG.getUNDEF(VT); 5501 } 5502 5503 // Next, we iteratively mix elements, e.g. for v4f32: 5504 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 5505 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 5506 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 5507 unsigned EltStride = NumElems >> 1; 5508 while (EltStride != 0) { 5509 for (unsigned i = 0; i < EltStride; ++i) { 5510 // If V[i+EltStride] is undef and this is the first round of mixing, 5511 // then it is safe to just drop this shuffle: V[i] is already in the 5512 // right place, the one element (since it's the first round) being 5513 // inserted as undef can be dropped. This isn't safe for successive 5514 // rounds because they will permute elements within both vectors. 5515 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 5516 EltStride == NumElems/2) 5517 continue; 5518 5519 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 5520 } 5521 EltStride >>= 1; 5522 } 5523 return V[0]; 5524 } 5525 return SDValue(); 5526} 5527 5528// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 5529// to create 256-bit vectors from two other 128-bit ones. 5530static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5531 DebugLoc dl = Op.getDebugLoc(); 5532 EVT ResVT = Op.getValueType(); 5533 5534 assert(ResVT.is256BitVector() && "Value type must be 256-bit wide"); 5535 5536 SDValue V1 = Op.getOperand(0); 5537 SDValue V2 = Op.getOperand(1); 5538 unsigned NumElems = ResVT.getVectorNumElements(); 5539 5540 return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 5541} 5542 5543SDValue 5544X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 5545 assert(Op.getNumOperands() == 2); 5546 5547 // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors 5548 // from two other 128-bit ones. 5549 return LowerAVXCONCAT_VECTORS(Op, DAG); 5550} 5551 5552// Try to lower a shuffle node into a simple blend instruction. 
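// --- Editor's sketch (not part of the original file) ------------------------
// A worked model of the unpckl merge tree used at the end of LowerBUILD_VECTOR
// above, tracking element labels only (-1 stands for undef). unpacklo(A, B)
// interleaves the low halves of A and B; applying it with a stride that halves
// each round gathers all scalars into V[0]. Hypothetical helpers, written for
// the 4-element case.
static void modelUnpackLo(const int *A, const int *B, unsigned N, int *Out) {
  for (unsigned k = 0; k != N / 2; ++k) {
    Out[2 * k]     = A[k];
    Out[2 * k + 1] = B[k];
  }
}
static void modelMergeTree(int V[][4], unsigned NumElems /* == 4 */) {
  int Tmp[4];
  for (unsigned Stride = NumElems / 2; Stride != 0; Stride >>= 1)
    for (unsigned i = 0; i != Stride; ++i) {
      modelUnpackLo(V[i], V[i + Stride], NumElems, Tmp);
      for (unsigned k = 0; k != NumElems; ++k)
        V[i][k] = Tmp[k];
    }
  // Starting from V[i] = <i, ?, ?, ?> this yields V[0] = <0, 1, 2, 3>, which
  // matches the Step 1 / Step 2 comment above (that comment lists elements
  // high-to-low).
}
// --- end of editor's sketch --------------------------------------------------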
5553static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, 5554 const X86Subtarget *Subtarget, 5555 SelectionDAG &DAG) { 5556 SDValue V1 = SVOp->getOperand(0); 5557 SDValue V2 = SVOp->getOperand(1); 5558 DebugLoc dl = SVOp->getDebugLoc(); 5559 MVT VT = SVOp->getValueType(0).getSimpleVT(); 5560 unsigned NumElems = VT.getVectorNumElements(); 5561 5562 if (!Subtarget->hasSSE41()) 5563 return SDValue(); 5564 5565 unsigned ISDNo = 0; 5566 MVT OpTy; 5567 5568 switch (VT.SimpleTy) { 5569 default: return SDValue(); 5570 case MVT::v8i16: 5571 ISDNo = X86ISD::BLENDPW; 5572 OpTy = MVT::v8i16; 5573 break; 5574 case MVT::v4i32: 5575 case MVT::v4f32: 5576 ISDNo = X86ISD::BLENDPS; 5577 OpTy = MVT::v4f32; 5578 break; 5579 case MVT::v2i64: 5580 case MVT::v2f64: 5581 ISDNo = X86ISD::BLENDPD; 5582 OpTy = MVT::v2f64; 5583 break; 5584 case MVT::v8i32: 5585 case MVT::v8f32: 5586 if (!Subtarget->hasAVX()) 5587 return SDValue(); 5588 ISDNo = X86ISD::BLENDPS; 5589 OpTy = MVT::v8f32; 5590 break; 5591 case MVT::v4i64: 5592 case MVT::v4f64: 5593 if (!Subtarget->hasAVX()) 5594 return SDValue(); 5595 ISDNo = X86ISD::BLENDPD; 5596 OpTy = MVT::v4f64; 5597 break; 5598 } 5599 assert(ISDNo && "Invalid Op Number"); 5600 5601 unsigned MaskVals = 0; 5602 5603 for (unsigned i = 0; i != NumElems; ++i) { 5604 int EltIdx = SVOp->getMaskElt(i); 5605 if (EltIdx == (int)i || EltIdx < 0) 5606 MaskVals |= (1<<i); 5607 else if (EltIdx == (int)(i + NumElems)) 5608 continue; // Bit is set to zero; 5609 else 5610 return SDValue(); 5611 } 5612 5613 V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1); 5614 V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2); 5615 SDValue Ret = DAG.getNode(ISDNo, dl, OpTy, V1, V2, 5616 DAG.getConstant(MaskVals, MVT::i32)); 5617 return DAG.getNode(ISD::BITCAST, dl, VT, Ret); 5618} 5619 5620// v8i16 shuffles - Prefer shuffles in the following order: 5621// 1. [all] pshuflw, pshufhw, optional move 5622// 2. [ssse3] 1 x pshufb 5623// 3. [ssse3] 2 x pshufb + 1 x por 5624// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 5625SDValue 5626X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 5627 SelectionDAG &DAG) const { 5628 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5629 SDValue V1 = SVOp->getOperand(0); 5630 SDValue V2 = SVOp->getOperand(1); 5631 DebugLoc dl = SVOp->getDebugLoc(); 5632 SmallVector<int, 8> MaskVals; 5633 5634 // Determine if more than 1 of the words in each of the low and high quadwords 5635 // of the result come from the same quadword of one of the two inputs. Undef 5636 // mask values count as coming from any quadword, for better codegen. 5637 unsigned LoQuad[] = { 0, 0, 0, 0 }; 5638 unsigned HiQuad[] = { 0, 0, 0, 0 }; 5639 std::bitset<4> InputQuads; 5640 for (unsigned i = 0; i < 8; ++i) { 5641 unsigned *Quad = i < 4 ? 
LoQuad : HiQuad; 5642 int EltIdx = SVOp->getMaskElt(i); 5643 MaskVals.push_back(EltIdx); 5644 if (EltIdx < 0) { 5645 ++Quad[0]; 5646 ++Quad[1]; 5647 ++Quad[2]; 5648 ++Quad[3]; 5649 continue; 5650 } 5651 ++Quad[EltIdx / 4]; 5652 InputQuads.set(EltIdx / 4); 5653 } 5654 5655 int BestLoQuad = -1; 5656 unsigned MaxQuad = 1; 5657 for (unsigned i = 0; i < 4; ++i) { 5658 if (LoQuad[i] > MaxQuad) { 5659 BestLoQuad = i; 5660 MaxQuad = LoQuad[i]; 5661 } 5662 } 5663 5664 int BestHiQuad = -1; 5665 MaxQuad = 1; 5666 for (unsigned i = 0; i < 4; ++i) { 5667 if (HiQuad[i] > MaxQuad) { 5668 BestHiQuad = i; 5669 MaxQuad = HiQuad[i]; 5670 } 5671 } 5672 5673 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 5674 // of the two input vectors, shuffle them into one input vector so only a 5675 // single pshufb instruction is necessary. If There are more than 2 input 5676 // quads, disable the next transformation since it does not help SSSE3. 5677 bool V1Used = InputQuads[0] || InputQuads[1]; 5678 bool V2Used = InputQuads[2] || InputQuads[3]; 5679 if (Subtarget->hasSSSE3()) { 5680 if (InputQuads.count() == 2 && V1Used && V2Used) { 5681 BestLoQuad = InputQuads[0] ? 0 : 1; 5682 BestHiQuad = InputQuads[2] ? 2 : 3; 5683 } 5684 if (InputQuads.count() > 2) { 5685 BestLoQuad = -1; 5686 BestHiQuad = -1; 5687 } 5688 } 5689 5690 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 5691 // the shuffle mask. If a quad is scored as -1, that means that it contains 5692 // words from all 4 input quadwords. 5693 SDValue NewV; 5694 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 5695 int MaskV[] = { 5696 BestLoQuad < 0 ? 0 : BestLoQuad, 5697 BestHiQuad < 0 ? 1 : BestHiQuad 5698 }; 5699 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 5700 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 5701 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 5702 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 5703 5704 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 5705 // source words for the shuffle, to aid later transformations. 5706 bool AllWordsInNewV = true; 5707 bool InOrder[2] = { true, true }; 5708 for (unsigned i = 0; i != 8; ++i) { 5709 int idx = MaskVals[i]; 5710 if (idx != (int)i) 5711 InOrder[i/4] = false; 5712 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 5713 continue; 5714 AllWordsInNewV = false; 5715 break; 5716 } 5717 5718 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 5719 if (AllWordsInNewV) { 5720 for (int i = 0; i != 8; ++i) { 5721 int idx = MaskVals[i]; 5722 if (idx < 0) 5723 continue; 5724 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 5725 if ((idx != i) && idx < 4) 5726 pshufhw = false; 5727 if ((idx != i) && idx > 3) 5728 pshuflw = false; 5729 } 5730 V1 = NewV; 5731 V2Used = false; 5732 BestLoQuad = 0; 5733 BestHiQuad = 1; 5734 } 5735 5736 // If we've eliminated the use of V2, and the new mask is a pshuflw or 5737 // pshufhw, that's as cheap as it gets. Return the new shuffle. 5738 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 5739 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 5740 unsigned TargetMask = 0; 5741 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 5742 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 5743 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5744 TargetMask = pshufhw ? 
getShufflePSHUFHWImmediate(SVOp): 5745 getShufflePSHUFLWImmediate(SVOp); 5746 V1 = NewV.getOperand(0); 5747 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 5748 } 5749 } 5750 5751 // If we have SSSE3, and all words of the result are from 1 input vector, 5752 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 5753 // is present, fall back to case 4. 5754 if (Subtarget->hasSSSE3()) { 5755 SmallVector<SDValue,16> pshufbMask; 5756 5757 // If we have elements from both input vectors, set the high bit of the 5758 // shuffle mask element to zero out elements that come from V2 in the V1 5759 // mask, and elements that come from V1 in the V2 mask, so that the two 5760 // results can be OR'd together. 5761 bool TwoInputs = V1Used && V2Used; 5762 for (unsigned i = 0; i != 8; ++i) { 5763 int EltIdx = MaskVals[i] * 2; 5764 int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx; 5765 int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1; 5766 pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); 5767 pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); 5768 } 5769 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 5770 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5771 DAG.getNode(ISD::BUILD_VECTOR, dl, 5772 MVT::v16i8, &pshufbMask[0], 16)); 5773 if (!TwoInputs) 5774 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5775 5776 // Calculate the shuffle mask for the second input, shuffle it, and 5777 // OR it with the first shuffled input. 5778 pshufbMask.clear(); 5779 for (unsigned i = 0; i != 8; ++i) { 5780 int EltIdx = MaskVals[i] * 2; 5781 int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16; 5782 int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15; 5783 pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); 5784 pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); 5785 } 5786 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 5787 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5788 DAG.getNode(ISD::BUILD_VECTOR, dl, 5789 MVT::v16i8, &pshufbMask[0], 16)); 5790 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5791 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5792 } 5793 5794 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 5795 // and update MaskVals with new element order. 5796 std::bitset<8> InOrder; 5797 if (BestLoQuad >= 0) { 5798 int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; 5799 for (int i = 0; i != 4; ++i) { 5800 int idx = MaskVals[i]; 5801 if (idx < 0) { 5802 InOrder.set(i); 5803 } else if ((idx / 4) == BestLoQuad) { 5804 MaskV[i] = idx & 3; 5805 InOrder.set(i); 5806 } 5807 } 5808 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5809 &MaskV[0]); 5810 5811 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 5812 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5813 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 5814 NewV.getOperand(0), 5815 getShufflePSHUFLWImmediate(SVOp), DAG); 5816 } 5817 } 5818 5819 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 5820 // and update MaskVals with the new element order. 
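// --- Editor's sketch (not part of the original file) ------------------------
// How the PSHUFLW / PSHUFHW immediates used above and below are formed from a
// v8i16 shuffle mask: each of the four affected words gets a 2-bit field that
// selects its source word within the same half, and the other half is copied
// unchanged. Assumes the four entries stay within their half; undef entries
// are treated as 0 here. Hypothetical helper mirroring the x86 encoding that
// getShufflePSHUFLWImmediate / getShufflePSHUFHWImmediate produce.
static unsigned modelPSHUFImmediate(const int *MaskVals, bool HighHalf) {
  unsigned Base = HighHalf ? 4 : 0;  // words 4..7 for PSHUFHW, 0..3 for PSHUFLW
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Idx = MaskVals[Base + i];
    unsigned Field = (Idx < 0) ? 0 : (unsigned)Idx - Base; // in-half index 0..3
    Imm |= Field << (2 * i);
  }
  return Imm;
  // e.g. a pshuflw mask <2,1,3,0, 4,5,6,7> encodes as 0x36 (0b00110110).
}
// --- end of editor's sketch --------------------------------------------------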
5821 if (BestHiQuad >= 0) { 5822 int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; 5823 for (unsigned i = 4; i != 8; ++i) { 5824 int idx = MaskVals[i]; 5825 if (idx < 0) { 5826 InOrder.set(i); 5827 } else if ((idx / 4) == BestHiQuad) { 5828 MaskV[i] = (idx & 3) + 4; 5829 InOrder.set(i); 5830 } 5831 } 5832 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5833 &MaskV[0]); 5834 5835 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 5836 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5837 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 5838 NewV.getOperand(0), 5839 getShufflePSHUFHWImmediate(SVOp), DAG); 5840 } 5841 } 5842 5843 // In case BestHi & BestLo were both -1, which means each quadword has a word 5844 // from each of the four input quadwords, calculate the InOrder bitvector now 5845 // before falling through to the insert/extract cleanup. 5846 if (BestLoQuad == -1 && BestHiQuad == -1) { 5847 NewV = V1; 5848 for (int i = 0; i != 8; ++i) 5849 if (MaskVals[i] < 0 || MaskVals[i] == i) 5850 InOrder.set(i); 5851 } 5852 5853 // The other elements are put in the right place using pextrw and pinsrw. 5854 for (unsigned i = 0; i != 8; ++i) { 5855 if (InOrder[i]) 5856 continue; 5857 int EltIdx = MaskVals[i]; 5858 if (EltIdx < 0) 5859 continue; 5860 SDValue ExtOp = (EltIdx < 8) ? 5861 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 5862 DAG.getIntPtrConstant(EltIdx)) : 5863 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 5864 DAG.getIntPtrConstant(EltIdx - 8)); 5865 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 5866 DAG.getIntPtrConstant(i)); 5867 } 5868 return NewV; 5869} 5870 5871// v16i8 shuffles - Prefer shuffles in the following order: 5872// 1. [ssse3] 1 x pshufb 5873// 2. [ssse3] 2 x pshufb + 1 x por 5874// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 5875static 5876SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 5877 SelectionDAG &DAG, 5878 const X86TargetLowering &TLI) { 5879 SDValue V1 = SVOp->getOperand(0); 5880 SDValue V2 = SVOp->getOperand(1); 5881 DebugLoc dl = SVOp->getDebugLoc(); 5882 ArrayRef<int> MaskVals = SVOp->getMask(); 5883 5884 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 5885 5886 // If we have SSSE3, case 1 is generated when all result bytes come from 5887 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 5888 // present, fall back to case 3. 5889 5890 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 5891 if (TLI.getSubtarget()->hasSSSE3()) { 5892 SmallVector<SDValue,16> pshufbMask; 5893 5894 // If all result elements are from one input vector, then only translate 5895 // undef mask values to 0x80 (zero out result) in the pshufb mask. 5896 // 5897 // Otherwise, we have elements from both input vectors, and must zero out 5898 // elements that come from V2 in the first mask, and V1 in the second mask 5899 // so that we can OR them together. 5900 for (unsigned i = 0; i != 16; ++i) { 5901 int EltIdx = MaskVals[i]; 5902 if (EltIdx < 0 || EltIdx >= 16) 5903 EltIdx = 0x80; 5904 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5905 } 5906 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5907 DAG.getNode(ISD::BUILD_VECTOR, dl, 5908 MVT::v16i8, &pshufbMask[0], 16)); 5909 if (V2IsUndef) 5910 return V1; 5911 5912 // Calculate the shuffle mask for the second input, shuffle it, and 5913 // OR it with the first shuffled input. 
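// --- Editor's sketch (not part of the original file) ------------------------
// The pair of PSHUFB control vectors built on this two-input path (the first
// just above, the second right below), modeled on a fully defined 16-entry
// byte mask with values 0..31. A control byte with the top bit set (0x80)
// makes PSHUFB write zero, so each half zeroes the bytes owned by the other
// input and the two results are OR'd together. Hypothetical helper.
static void modelTwoInputPSHUFBMasks(const int *MaskVals /* 16 entries */,
                                     unsigned char *V1Ctl,
                                     unsigned char *V2Ctl) {
  for (unsigned i = 0; i != 16; ++i) {
    int EltIdx = MaskVals[i];
    V1Ctl[i] = (EltIdx < 16) ? (unsigned char)EltIdx : 0x80;        // from V1
    V2Ctl[i] = (EltIdx < 16) ? 0x80 : (unsigned char)(EltIdx - 16); // from V2
  }
  // por(pshufb(V1, V1Ctl), pshufb(V2, V2Ctl)) then yields the shuffled vector.
}
// --- end of editor's sketch --------------------------------------------------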
5914 pshufbMask.clear(); 5915 for (unsigned i = 0; i != 16; ++i) { 5916 int EltIdx = MaskVals[i]; 5917 EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; 5918 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5919 } 5920 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5921 DAG.getNode(ISD::BUILD_VECTOR, dl, 5922 MVT::v16i8, &pshufbMask[0], 16)); 5923 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5924 } 5925 5926 // No SSSE3 - Calculate in place words and then fix all out of place words 5927 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 5928 // the 16 different words that comprise the two doublequadword input vectors. 5929 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5930 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 5931 SDValue NewV = V1; 5932 for (int i = 0; i != 8; ++i) { 5933 int Elt0 = MaskVals[i*2]; 5934 int Elt1 = MaskVals[i*2+1]; 5935 5936 // This word of the result is all undef, skip it. 5937 if (Elt0 < 0 && Elt1 < 0) 5938 continue; 5939 5940 // This word of the result is already in the correct place, skip it. 5941 if ((Elt0 == i*2) && (Elt1 == i*2+1)) 5942 continue; 5943 5944 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 5945 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 5946 SDValue InsElt; 5947 5948 // If Elt0 and Elt1 are defined, are consecutive, and can be load 5949 // using a single extract together, load it and store it. 5950 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 5951 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5952 DAG.getIntPtrConstant(Elt1 / 2)); 5953 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5954 DAG.getIntPtrConstant(i)); 5955 continue; 5956 } 5957 5958 // If Elt1 is defined, extract it from the appropriate source. If the 5959 // source byte is not also odd, shift the extracted word left 8 bits 5960 // otherwise clear the bottom 8 bits if we need to do an or. 5961 if (Elt1 >= 0) { 5962 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5963 DAG.getIntPtrConstant(Elt1 / 2)); 5964 if ((Elt1 & 1) == 0) 5965 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 5966 DAG.getConstant(8, 5967 TLI.getShiftAmountTy(InsElt.getValueType()))); 5968 else if (Elt0 >= 0) 5969 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 5970 DAG.getConstant(0xFF00, MVT::i16)); 5971 } 5972 // If Elt0 is defined, extract it from the appropriate source. If the 5973 // source byte is not also even, shift the extracted word right 8 bits. If 5974 // Elt1 was also defined, OR the extracted values together before 5975 // inserting them in the result. 5976 if (Elt0 >= 0) { 5977 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 5978 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 5979 if ((Elt0 & 1) != 0) 5980 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 5981 DAG.getConstant(8, 5982 TLI.getShiftAmountTy(InsElt0.getValueType()))); 5983 else if (Elt1 >= 0) 5984 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 5985 DAG.getConstant(0x00FF, MVT::i16)); 5986 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 5987 : InsElt0; 5988 } 5989 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5990 DAG.getIntPtrConstant(i)); 5991 } 5992 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 5993} 5994 5995/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 5996/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. 
This can be 5997/// done when every pair / quad of shuffle mask elements point to elements in 5998/// the right sequence. e.g. 5999/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 6000static 6001SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 6002 SelectionDAG &DAG, DebugLoc dl) { 6003 MVT VT = SVOp->getValueType(0).getSimpleVT(); 6004 unsigned NumElems = VT.getVectorNumElements(); 6005 MVT NewVT; 6006 unsigned Scale; 6007 switch (VT.SimpleTy) { 6008 default: llvm_unreachable("Unexpected!"); 6009 case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; 6010 case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; 6011 case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; 6012 case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break; 6013 case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break; 6014 case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break; 6015 } 6016 6017 SmallVector<int, 8> MaskVec; 6018 for (unsigned i = 0; i != NumElems; i += Scale) { 6019 int StartIdx = -1; 6020 for (unsigned j = 0; j != Scale; ++j) { 6021 int EltIdx = SVOp->getMaskElt(i+j); 6022 if (EltIdx < 0) 6023 continue; 6024 if (StartIdx < 0) 6025 StartIdx = (EltIdx / Scale); 6026 if (EltIdx != (int)(StartIdx*Scale + j)) 6027 return SDValue(); 6028 } 6029 MaskVec.push_back(StartIdx); 6030 } 6031 6032 SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0)); 6033 SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1)); 6034 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 6035} 6036 6037/// getVZextMovL - Return a zero-extending vector move low node. 6038/// 6039static SDValue getVZextMovL(EVT VT, EVT OpVT, 6040 SDValue SrcOp, SelectionDAG &DAG, 6041 const X86Subtarget *Subtarget, DebugLoc dl) { 6042 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 6043 LoadSDNode *LD = NULL; 6044 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 6045 LD = dyn_cast<LoadSDNode>(SrcOp); 6046 if (!LD) { 6047 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 6048 // instead. 6049 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 6050 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 6051 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 6052 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 6053 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 6054 // PR2108 6055 OpVT = (OpVT == MVT::v2f64) ? 
MVT::v2i64 : MVT::v4i32; 6056 return DAG.getNode(ISD::BITCAST, dl, VT, 6057 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 6058 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6059 OpVT, 6060 SrcOp.getOperand(0) 6061 .getOperand(0)))); 6062 } 6063 } 6064 } 6065 6066 return DAG.getNode(ISD::BITCAST, dl, VT, 6067 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 6068 DAG.getNode(ISD::BITCAST, dl, 6069 OpVT, SrcOp))); 6070} 6071 6072/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles 6073/// which could not be matched by any known target speficic shuffle 6074static SDValue 6075LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 6076 6077 SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG); 6078 if (NewOp.getNode()) 6079 return NewOp; 6080 6081 EVT VT = SVOp->getValueType(0); 6082 6083 unsigned NumElems = VT.getVectorNumElements(); 6084 unsigned NumLaneElems = NumElems / 2; 6085 6086 DebugLoc dl = SVOp->getDebugLoc(); 6087 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 6088 EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); 6089 SDValue Output[2]; 6090 6091 SmallVector<int, 16> Mask; 6092 for (unsigned l = 0; l < 2; ++l) { 6093 // Build a shuffle mask for the output, discovering on the fly which 6094 // input vectors to use as shuffle operands (recorded in InputUsed). 6095 // If building a suitable shuffle vector proves too hard, then bail 6096 // out with UseBuildVector set. 6097 bool UseBuildVector = false; 6098 int InputUsed[2] = { -1, -1 }; // Not yet discovered. 6099 unsigned LaneStart = l * NumLaneElems; 6100 for (unsigned i = 0; i != NumLaneElems; ++i) { 6101 // The mask element. This indexes into the input. 6102 int Idx = SVOp->getMaskElt(i+LaneStart); 6103 if (Idx < 0) { 6104 // the mask element does not index into any input vector. 6105 Mask.push_back(-1); 6106 continue; 6107 } 6108 6109 // The input vector this mask element indexes into. 6110 int Input = Idx / NumLaneElems; 6111 6112 // Turn the index into an offset from the start of the input vector. 6113 Idx -= Input * NumLaneElems; 6114 6115 // Find or create a shuffle vector operand to hold this input. 6116 unsigned OpNo; 6117 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { 6118 if (InputUsed[OpNo] == Input) 6119 // This input vector is already an operand. 6120 break; 6121 if (InputUsed[OpNo] < 0) { 6122 // Create a new operand for this input vector. 6123 InputUsed[OpNo] = Input; 6124 break; 6125 } 6126 } 6127 6128 if (OpNo >= array_lengthof(InputUsed)) { 6129 // More than two input vectors used! Give up on trying to create a 6130 // shuffle vector. Insert all elements into a BUILD_VECTOR instead. 6131 UseBuildVector = true; 6132 break; 6133 } 6134 6135 // Add the mask index for the new shuffle vector. 6136 Mask.push_back(Idx + OpNo * NumLaneElems); 6137 } 6138 6139 if (UseBuildVector) { 6140 SmallVector<SDValue, 16> SVOps; 6141 for (unsigned i = 0; i != NumLaneElems; ++i) { 6142 // The mask element. This indexes into the input. 6143 int Idx = SVOp->getMaskElt(i+LaneStart); 6144 if (Idx < 0) { 6145 SVOps.push_back(DAG.getUNDEF(EltVT)); 6146 continue; 6147 } 6148 6149 // The input vector this mask element indexes into. 6150 int Input = Idx / NumElems; 6151 6152 // Turn the index into an offset from the start of the input vector. 6153 Idx -= Input * NumElems; 6154 6155 // Extract the vector element by hand. 6156 SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 6157 SVOp->getOperand(Input), 6158 DAG.getIntPtrConstant(Idx))); 6159 } 6160 6161 // Construct the output using a BUILD_VECTOR. 
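        // Every lane element was either extracted above or pushed as UNDEF,
        // so SVOps holds exactly NumLaneElems operands at this point.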
6162 Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0], 6163 SVOps.size()); 6164 } else if (InputUsed[0] < 0) { 6165 // No input vectors were used! The result is undefined. 6166 Output[l] = DAG.getUNDEF(NVT); 6167 } else { 6168 SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), 6169 (InputUsed[0] % 2) * NumLaneElems, 6170 DAG, dl); 6171 // If only one input was used, use an undefined vector for the other. 6172 SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : 6173 Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), 6174 (InputUsed[1] % 2) * NumLaneElems, DAG, dl); 6175 // At least one input vector was used. Create a new shuffle vector. 6176 Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); 6177 } 6178 6179 Mask.clear(); 6180 } 6181 6182 // Concatenate the result back 6183 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]); 6184} 6185 6186/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with 6187/// 4 elements, and match them with several different shuffle types. 6188static SDValue 6189LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 6190 SDValue V1 = SVOp->getOperand(0); 6191 SDValue V2 = SVOp->getOperand(1); 6192 DebugLoc dl = SVOp->getDebugLoc(); 6193 EVT VT = SVOp->getValueType(0); 6194 6195 assert(VT.is128BitVector() && "Unsupported vector size"); 6196 6197 std::pair<int, int> Locs[4]; 6198 int Mask1[] = { -1, -1, -1, -1 }; 6199 SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); 6200 6201 unsigned NumHi = 0; 6202 unsigned NumLo = 0; 6203 for (unsigned i = 0; i != 4; ++i) { 6204 int Idx = PermMask[i]; 6205 if (Idx < 0) { 6206 Locs[i] = std::make_pair(-1, -1); 6207 } else { 6208 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 6209 if (Idx < 4) { 6210 Locs[i] = std::make_pair(0, NumLo); 6211 Mask1[NumLo] = Idx; 6212 NumLo++; 6213 } else { 6214 Locs[i] = std::make_pair(1, NumHi); 6215 if (2+NumHi < 4) 6216 Mask1[2+NumHi] = Idx; 6217 NumHi++; 6218 } 6219 } 6220 } 6221 6222 if (NumLo <= 2 && NumHi <= 2) { 6223 // If no more than two elements come from either vector. This can be 6224 // implemented with two shuffles. First shuffle gather the elements. 6225 // The second shuffle, which takes the first shuffle as both of its 6226 // vector operands, put the elements into the right order. 6227 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6228 6229 int Mask2[] = { -1, -1, -1, -1 }; 6230 6231 for (unsigned i = 0; i != 4; ++i) 6232 if (Locs[i].first != -1) { 6233 unsigned Idx = (i < 2) ? 0 : 4; 6234 Idx += Locs[i].first * 2 + Locs[i].second; 6235 Mask2[i] = Idx; 6236 } 6237 6238 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 6239 } 6240 6241 if (NumLo == 3 || NumHi == 3) { 6242 // Otherwise, we must have three elements from one vector, call it X, and 6243 // one element from the other, call it Y. First, use a shufps to build an 6244 // intermediate vector with the one element from Y and the element from X 6245 // that will be in the same half in the final destination (the indexes don't 6246 // matter). Then, use a shufps to build the final vector, taking the half 6247 // containing the element from Y from the intermediate, and the other half 6248 // from X. 6249 if (NumHi == 3) { 6250 // Normalize it so the 3 elements come from V1. 6251 CommuteVectorShuffleMask(PermMask, 4); 6252 std::swap(V1, V2); 6253 } 6254 6255 // Find the element from V2. 
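    // After the normalization above, at most one defined mask entry refers to
    // V2 (index >= 4); the loop below records its position in HiIndex.
    // e.g. for mask <0, 5, 2, 3>, HiIndex ends up as 1.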
6256 unsigned HiIndex; 6257 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 6258 int Val = PermMask[HiIndex]; 6259 if (Val < 0) 6260 continue; 6261 if (Val >= 4) 6262 break; 6263 } 6264 6265 Mask1[0] = PermMask[HiIndex]; 6266 Mask1[1] = -1; 6267 Mask1[2] = PermMask[HiIndex^1]; 6268 Mask1[3] = -1; 6269 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6270 6271 if (HiIndex >= 2) { 6272 Mask1[0] = PermMask[0]; 6273 Mask1[1] = PermMask[1]; 6274 Mask1[2] = HiIndex & 1 ? 6 : 4; 6275 Mask1[3] = HiIndex & 1 ? 4 : 6; 6276 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6277 } 6278 6279 Mask1[0] = HiIndex & 1 ? 2 : 0; 6280 Mask1[1] = HiIndex & 1 ? 0 : 2; 6281 Mask1[2] = PermMask[2]; 6282 Mask1[3] = PermMask[3]; 6283 if (Mask1[2] >= 0) 6284 Mask1[2] += 4; 6285 if (Mask1[3] >= 0) 6286 Mask1[3] += 4; 6287 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 6288 } 6289 6290 // Break it into (shuffle shuffle_hi, shuffle_lo). 6291 int LoMask[] = { -1, -1, -1, -1 }; 6292 int HiMask[] = { -1, -1, -1, -1 }; 6293 6294 int *MaskPtr = LoMask; 6295 unsigned MaskIdx = 0; 6296 unsigned LoIdx = 0; 6297 unsigned HiIdx = 2; 6298 for (unsigned i = 0; i != 4; ++i) { 6299 if (i == 2) { 6300 MaskPtr = HiMask; 6301 MaskIdx = 1; 6302 LoIdx = 0; 6303 HiIdx = 2; 6304 } 6305 int Idx = PermMask[i]; 6306 if (Idx < 0) { 6307 Locs[i] = std::make_pair(-1, -1); 6308 } else if (Idx < 4) { 6309 Locs[i] = std::make_pair(MaskIdx, LoIdx); 6310 MaskPtr[LoIdx] = Idx; 6311 LoIdx++; 6312 } else { 6313 Locs[i] = std::make_pair(MaskIdx, HiIdx); 6314 MaskPtr[HiIdx] = Idx; 6315 HiIdx++; 6316 } 6317 } 6318 6319 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 6320 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 6321 int MaskOps[] = { -1, -1, -1, -1 }; 6322 for (unsigned i = 0; i != 4; ++i) 6323 if (Locs[i].first != -1) 6324 MaskOps[i] = Locs[i].first * 4 + Locs[i].second; 6325 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 6326} 6327 6328static bool MayFoldVectorLoad(SDValue V) { 6329 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6330 V = V.getOperand(0); 6331 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 6332 V = V.getOperand(0); 6333 if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && 6334 V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) 6335 // BUILD_VECTOR (load), undef 6336 V = V.getOperand(0); 6337 if (MayFoldLoad(V)) 6338 return true; 6339 return false; 6340} 6341 6342// FIXME: the version above should always be used. Since there's 6343// a bug where several vector shuffles can't be folded because the 6344// DAG is not updated during lowering and a node claims to have two 6345// uses while it only has one, use this version, and let isel match 6346// another instruction if the load really happens to have more than 6347// one use. Remove this version after this bug get fixed. 6348// rdar://8434668, PR8156 6349static bool RelaxedMayFoldVectorLoad(SDValue V) { 6350 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6351 V = V.getOperand(0); 6352 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 6353 V = V.getOperand(0); 6354 if (ISD::isNormalLoad(V.getNode())) 6355 return true; 6356 return false; 6357} 6358 6359static 6360SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { 6361 EVT VT = Op.getValueType(); 6362 6363 // Canonizalize to v2f64. 
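  // MOVDDUP is only formed on v2f64 here, so bitcast the input, duplicate the
  // low element, and bitcast the result back to the original type.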
6364 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 6365 return DAG.getNode(ISD::BITCAST, dl, VT, 6366 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 6367 V1, DAG)); 6368} 6369 6370static 6371SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, 6372 bool HasSSE2) { 6373 SDValue V1 = Op.getOperand(0); 6374 SDValue V2 = Op.getOperand(1); 6375 EVT VT = Op.getValueType(); 6376 6377 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 6378 6379 if (HasSSE2 && VT == MVT::v2f64) 6380 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 6381 6382 // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1) 6383 return DAG.getNode(ISD::BITCAST, dl, VT, 6384 getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32, 6385 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1), 6386 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG)); 6387} 6388 6389static 6390SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { 6391 SDValue V1 = Op.getOperand(0); 6392 SDValue V2 = Op.getOperand(1); 6393 EVT VT = Op.getValueType(); 6394 6395 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 6396 "unsupported shuffle type"); 6397 6398 if (V2.getOpcode() == ISD::UNDEF) 6399 V2 = V1; 6400 6401 // v4i32 or v4f32 6402 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 6403} 6404 6405static 6406SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 6407 SDValue V1 = Op.getOperand(0); 6408 SDValue V2 = Op.getOperand(1); 6409 EVT VT = Op.getValueType(); 6410 unsigned NumElems = VT.getVectorNumElements(); 6411 6412 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 6413 // operand of these instructions is only memory, so check if there's a 6414 // potencial load folding here, otherwise use SHUFPS or MOVSD to match the 6415 // same masks. 6416 bool CanFoldLoad = false; 6417 6418 // Trivial case, when V2 comes from a load. 6419 if (MayFoldVectorLoad(V2)) 6420 CanFoldLoad = true; 6421 6422 // When V1 is a load, it can be folded later into a store in isel, example: 6423 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 6424 // turns into: 6425 // (MOVLPSmr addr:$src1, VR128:$src2) 6426 // So, recognize this potential and also use MOVLPS or MOVLPD 6427 else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 6428 CanFoldLoad = true; 6429 6430 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6431 if (CanFoldLoad) { 6432 if (HasSSE2 && NumElems == 2) 6433 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 6434 6435 if (NumElems == 4) 6436 // If we don't care about the second element, proceed to use movss. 6437 if (SVOp->getMaskElt(1) != -1) 6438 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 6439 } 6440 6441 // movl and movlp will both match v2i64, but v2i64 is never matched by 6442 // movl earlier because we make it strict to avoid messing with the movlp load 6443 // folding logic (see the code above getMOVLP call). Match it here then, 6444 // this is horrible, but will stay like this until we move all shuffle 6445 // matching to x86 specific nodes. Note that for the 1st condition all 6446 // types are matched with movsd. 
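  // In short: with SSE2, two-element types and any mask that is not a strict
  // MOVL mask are lowered to MOVSD, the remaining cases to MOVSS; without
  // SSE2 the operands are swapped and matched with SHUFPS below.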
6447 if (HasSSE2) { 6448 // FIXME: isMOVLMask should be checked and matched before getMOVLP, 6449 // as to remove this logic from here, as much as possible 6450 if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT)) 6451 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6452 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6453 } 6454 6455 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 6456 6457 // Invert the operand order and use SHUFPS to match it. 6458 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1, 6459 getShuffleSHUFImmediate(SVOp), DAG); 6460} 6461 6462SDValue 6463X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { 6464 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6465 EVT VT = Op.getValueType(); 6466 DebugLoc dl = Op.getDebugLoc(); 6467 SDValue V1 = Op.getOperand(0); 6468 SDValue V2 = Op.getOperand(1); 6469 6470 if (isZeroShuffle(SVOp)) 6471 return getZeroVector(VT, Subtarget, DAG, dl); 6472 6473 // Handle splat operations 6474 if (SVOp->isSplat()) { 6475 unsigned NumElem = VT.getVectorNumElements(); 6476 int Size = VT.getSizeInBits(); 6477 6478 // Use vbroadcast whenever the splat comes from a foldable load 6479 SDValue Broadcast = LowerVectorBroadcast(Op, DAG); 6480 if (Broadcast.getNode()) 6481 return Broadcast; 6482 6483 // Handle splats by matching through known shuffle masks 6484 if ((Size == 128 && NumElem <= 4) || 6485 (Size == 256 && NumElem < 8)) 6486 return SDValue(); 6487 6488 // All remaning splats are promoted to target supported vector shuffles. 6489 return PromoteSplat(SVOp, DAG); 6490 } 6491 6492 // If the shuffle can be profitably rewritten as a narrower shuffle, then 6493 // do it! 6494 if (VT == MVT::v8i16 || VT == MVT::v16i8 || 6495 VT == MVT::v16i16 || VT == MVT::v32i8) { 6496 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6497 if (NewOp.getNode()) 6498 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 6499 } else if ((VT == MVT::v4i32 || 6500 (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 6501 // FIXME: Figure out a cleaner way to do this. 6502 // Try to make use of movq to zero out the top part. 
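    // If one operand is an all-zeros build_vector and the narrowed shuffle is
    // a (commuted) MOVL mask, the whole thing becomes a zero-extending move
    // (X86ISD::VZEXT_MOVL) of the other operand.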
6503 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 6504 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6505 if (NewOp.getNode()) { 6506 EVT NewVT = NewOp.getValueType(); 6507 if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), 6508 NewVT, true, false)) 6509 return getVZextMovL(VT, NewVT, NewOp.getOperand(0), 6510 DAG, Subtarget, dl); 6511 } 6512 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 6513 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6514 if (NewOp.getNode()) { 6515 EVT NewVT = NewOp.getValueType(); 6516 if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) 6517 return getVZextMovL(VT, NewVT, NewOp.getOperand(1), 6518 DAG, Subtarget, dl); 6519 } 6520 } 6521 } 6522 return SDValue(); 6523} 6524 6525SDValue 6526X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 6527 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6528 SDValue V1 = Op.getOperand(0); 6529 SDValue V2 = Op.getOperand(1); 6530 EVT VT = Op.getValueType(); 6531 DebugLoc dl = Op.getDebugLoc(); 6532 unsigned NumElems = VT.getVectorNumElements(); 6533 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 6534 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 6535 bool V1IsSplat = false; 6536 bool V2IsSplat = false; 6537 bool HasSSE2 = Subtarget->hasSSE2(); 6538 bool HasAVX = Subtarget->hasAVX(); 6539 bool HasAVX2 = Subtarget->hasAVX2(); 6540 MachineFunction &MF = DAG.getMachineFunction(); 6541 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 6542 6543 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); 6544 6545 if (V1IsUndef && V2IsUndef) 6546 return DAG.getUNDEF(VT); 6547 6548 assert(!V1IsUndef && "Op 1 of shuffle should not be undef"); 6549 6550 // Vector shuffle lowering takes 3 steps: 6551 // 6552 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 6553 // narrowing and commutation of operands should be handled. 6554 // 2) Matching of shuffles with known shuffle masks to x86 target specific 6555 // shuffle nodes. 6556 // 3) Rewriting of unmatched masks into new generic shuffle operations, 6557 // so the shuffle can be broken into other shuffles and the legalizer can 6558 // try the lowering again. 6559 // 6560 // The general idea is that no vector_shuffle operation should be left to 6561 // be matched during isel, all of them must be converted to a target specific 6562 // node here. 6563 6564 // Normalize the input vectors. Here splats, zeroed vectors, profitable 6565 // narrowing and commutation of operands should be handled. The actual code 6566 // doesn't include all of those, work in progress... 6567 SDValue NewOp = NormalizeVectorShuffle(Op, DAG); 6568 if (NewOp.getNode()) 6569 return NewOp; 6570 6571 SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); 6572 6573 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 6574 // unpckh_undef). Only use pshufd if speed is more important than size. 
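  // UNPCKL/UNPCKH with the same register for both inputs needs no immediate
  // byte, so it encodes slightly smaller than the equivalent PSHUFD.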
6575 if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) 6576 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6577 if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) 6578 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6579 6580 if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && 6581 V2IsUndef && RelaxedMayFoldVectorLoad(V1)) 6582 return getMOVDDup(Op, dl, V1, DAG); 6583 6584 if (isMOVHLPS_v_undef_Mask(M, VT)) 6585 return getMOVHighToLow(Op, dl, DAG); 6586 6587 // Use to match splats 6588 if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef && 6589 (VT == MVT::v2f64 || VT == MVT::v2i64)) 6590 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6591 6592 if (isPSHUFDMask(M, VT)) { 6593 // The actual implementation will match the mask in the if above and then 6594 // during isel it can match several different instructions, not only pshufd 6595 // as its name says, sad but true, emulate the behavior for now... 6596 if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 6597 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 6598 6599 unsigned TargetMask = getShuffleSHUFImmediate(SVOp); 6600 6601 if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64)) 6602 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG); 6603 6604 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 6605 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 6606 6607 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, 6608 TargetMask, DAG); 6609 } 6610 6611 // Check if this can be converted into a logical shift. 6612 bool isLeft = false; 6613 unsigned ShAmt = 0; 6614 SDValue ShVal; 6615 bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 6616 if (isShift && ShVal.hasOneUse()) { 6617 // If the shifted value has multiple uses, it may be cheaper to use 6618 // v_set0 + movlhps or movhlps, etc. 6619 EVT EltVT = VT.getVectorElementType(); 6620 ShAmt *= EltVT.getSizeInBits(); 6621 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6622 } 6623 6624 if (isMOVLMask(M, VT)) { 6625 if (ISD::isBuildVectorAllZeros(V1.getNode())) 6626 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 6627 if (!isMOVLPMask(M, VT)) { 6628 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 6629 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6630 6631 if (VT == MVT::v4i32 || VT == MVT::v4f32) 6632 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6633 } 6634 } 6635 6636 // FIXME: fold these into legal mask. 6637 if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2)) 6638 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 6639 6640 if (isMOVHLPSMask(M, VT)) 6641 return getMOVHighToLow(Op, dl, DAG); 6642 6643 if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) 6644 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 6645 6646 if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) 6647 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 6648 6649 if (isMOVLPMask(M, VT)) 6650 return getMOVLP(Op, dl, DAG, HasSSE2); 6651 6652 if (ShouldXformToMOVHLPS(M, VT) || 6653 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) 6654 return CommuteVectorShuffle(SVOp, DAG); 6655 6656 if (isShift) { 6657 // No better options. Use a vshldq / vsrldq. 
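    // ShAmt was computed in elements; scale it by the element width so
    // getVShift receives the shift amount in bits.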
6658 EVT EltVT = VT.getVectorElementType(); 6659 ShAmt *= EltVT.getSizeInBits(); 6660 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6661 } 6662 6663 bool Commuted = false; 6664 // FIXME: This should also accept a bitcast of a splat? Be careful, not 6665 // 1,1,1,1 -> v8i16 though. 6666 V1IsSplat = isSplatVector(V1.getNode()); 6667 V2IsSplat = isSplatVector(V2.getNode()); 6668 6669 // Canonicalize the splat or undef, if present, to be on the RHS. 6670 if (!V2IsUndef && V1IsSplat && !V2IsSplat) { 6671 CommuteVectorShuffleMask(M, NumElems); 6672 std::swap(V1, V2); 6673 std::swap(V1IsSplat, V2IsSplat); 6674 Commuted = true; 6675 } 6676 6677 if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { 6678 // Shuffling low element of v1 into undef, just return v1. 6679 if (V2IsUndef) 6680 return V1; 6681 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 6682 // the instruction selector will not match, so get a canonical MOVL with 6683 // swapped operands to undo the commute. 6684 return getMOVL(DAG, dl, VT, V2, V1); 6685 } 6686 6687 if (isUNPCKLMask(M, VT, HasAVX2)) 6688 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 6689 6690 if (isUNPCKHMask(M, VT, HasAVX2)) 6691 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 6692 6693 if (V2IsSplat) { 6694 // Normalize mask so all entries that point to V2 points to its first 6695 // element then try to match unpck{h|l} again. If match, return a 6696 // new vector_shuffle with the corrected mask.p 6697 SmallVector<int, 8> NewMask(M.begin(), M.end()); 6698 NormalizeMask(NewMask, NumElems); 6699 if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) 6700 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 6701 if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) 6702 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 6703 } 6704 6705 if (Commuted) { 6706 // Commute is back and try unpck* again. 6707 // FIXME: this seems wrong. 6708 CommuteVectorShuffleMask(M, NumElems); 6709 std::swap(V1, V2); 6710 std::swap(V1IsSplat, V2IsSplat); 6711 Commuted = false; 6712 6713 if (isUNPCKLMask(M, VT, HasAVX2)) 6714 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 6715 6716 if (isUNPCKHMask(M, VT, HasAVX2)) 6717 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 6718 } 6719 6720 // Normalize the node to match x86 shuffle ops if needed 6721 if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true))) 6722 return CommuteVectorShuffle(SVOp, DAG); 6723 6724 // The checks below are all present in isShuffleMaskLegal, but they are 6725 // inlined here right now to enable us to directly emit target specific 6726 // nodes, and remove one by one until they don't return Op anymore. 
6727 6728 if (isPALIGNRMask(M, VT, Subtarget)) 6729 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 6730 getShufflePALIGNRImmediate(SVOp), 6731 DAG); 6732 6733 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 6734 SVOp->getSplatIndex() == 0 && V2IsUndef) { 6735 if (VT == MVT::v2f64 || VT == MVT::v2i64) 6736 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6737 } 6738 6739 if (isPSHUFHWMask(M, VT, HasAVX2)) 6740 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 6741 getShufflePSHUFHWImmediate(SVOp), 6742 DAG); 6743 6744 if (isPSHUFLWMask(M, VT, HasAVX2)) 6745 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 6746 getShufflePSHUFLWImmediate(SVOp), 6747 DAG); 6748 6749 if (isSHUFPMask(M, VT, HasAVX)) 6750 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, 6751 getShuffleSHUFImmediate(SVOp), DAG); 6752 6753 if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) 6754 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6755 if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) 6756 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6757 6758 //===--------------------------------------------------------------------===// 6759 // Generate target specific nodes for 128 or 256-bit shuffles only 6760 // supported in the AVX instruction set. 6761 // 6762 6763 // Handle VMOVDDUPY permutations 6764 if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX)) 6765 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); 6766 6767 // Handle VPERMILPS/D* permutations 6768 if (isVPERMILPMask(M, VT, HasAVX)) { 6769 if (HasAVX2 && VT == MVT::v8i32) 6770 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, 6771 getShuffleSHUFImmediate(SVOp), DAG); 6772 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, 6773 getShuffleSHUFImmediate(SVOp), DAG); 6774 } 6775 6776 // Handle VPERM2F128/VPERM2I128 permutations 6777 if (isVPERM2X128Mask(M, VT, HasAVX)) 6778 return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, 6779 V2, getShuffleVPERM2X128Immediate(SVOp), DAG); 6780 6781 SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG); 6782 if (BlendOp.getNode()) 6783 return BlendOp; 6784 6785 if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) { 6786 SmallVector<SDValue, 8> permclMask; 6787 for (unsigned i = 0; i != 8; ++i) { 6788 permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32)); 6789 } 6790 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, 6791 &permclMask[0], 8); 6792 // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 6793 return DAG.getNode(X86ISD::VPERMV, dl, VT, 6794 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); 6795 } 6796 6797 if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64)) 6798 return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, 6799 getShuffleCLImmediate(SVOp), DAG); 6800 6801 6802 //===--------------------------------------------------------------------===// 6803 // Since no target specific shuffle was selected for this generic one, 6804 // lower it into other known shuffles. FIXME: this isn't true yet, but 6805 // this is the plan. 6806 // 6807 6808 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 
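  // Both helpers below return an empty SDValue when they cannot produce a
  // better sequence, in which case lowering falls through to the 4-element
  // and 256-bit handling further down.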
6809 if (VT == MVT::v8i16) { 6810 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 6811 if (NewOp.getNode()) 6812 return NewOp; 6813 } 6814 6815 if (VT == MVT::v16i8) { 6816 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 6817 if (NewOp.getNode()) 6818 return NewOp; 6819 } 6820 6821 // Handle all 128-bit wide vectors with 4 elements, and match them with 6822 // several different shuffle types. 6823 if (NumElems == 4 && VT.is128BitVector()) 6824 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 6825 6826 // Handle general 256-bit shuffles 6827 if (VT.is256BitVector()) 6828 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 6829 6830 return SDValue(); 6831} 6832 6833SDValue 6834X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 6835 SelectionDAG &DAG) const { 6836 EVT VT = Op.getValueType(); 6837 DebugLoc dl = Op.getDebugLoc(); 6838 6839 if (!Op.getOperand(0).getValueType().is128BitVector()) 6840 return SDValue(); 6841 6842 if (VT.getSizeInBits() == 8) { 6843 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 6844 Op.getOperand(0), Op.getOperand(1)); 6845 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6846 DAG.getValueType(VT)); 6847 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6848 } 6849 6850 if (VT.getSizeInBits() == 16) { 6851 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6852 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 6853 if (Idx == 0) 6854 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6855 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6856 DAG.getNode(ISD::BITCAST, dl, 6857 MVT::v4i32, 6858 Op.getOperand(0)), 6859 Op.getOperand(1))); 6860 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 6861 Op.getOperand(0), Op.getOperand(1)); 6862 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6863 DAG.getValueType(VT)); 6864 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6865 } 6866 6867 if (VT == MVT::f32) { 6868 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 6869 // the result back to FR32 register. It's only worth matching if the 6870 // result has a single use which is a store or a bitcast to i32. And in 6871 // the case of a store, it's not worth it if the index is a constant 0, 6872 // because a MOVSSmr can be used instead, which is smaller and faster. 6873 if (!Op.hasOneUse()) 6874 return SDValue(); 6875 SDNode *User = *Op.getNode()->use_begin(); 6876 if ((User->getOpcode() != ISD::STORE || 6877 (isa<ConstantSDNode>(Op.getOperand(1)) && 6878 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 6879 (User->getOpcode() != ISD::BITCAST || 6880 User->getValueType(0) != MVT::i32)) 6881 return SDValue(); 6882 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6883 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 6884 Op.getOperand(0)), 6885 Op.getOperand(1)); 6886 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 6887 } 6888 6889 if (VT == MVT::i32 || VT == MVT::i64) { 6890 // ExtractPS/pextrq works with constant index. 
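    // Returning Op unchanged leaves the node to be matched by the normal
    // EXTRACTPS/PEXTRQ patterns during instruction selection.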
6891 if (isa<ConstantSDNode>(Op.getOperand(1))) 6892 return Op; 6893 } 6894 return SDValue(); 6895} 6896 6897 6898SDValue 6899X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 6900 SelectionDAG &DAG) const { 6901 if (!isa<ConstantSDNode>(Op.getOperand(1))) 6902 return SDValue(); 6903 6904 SDValue Vec = Op.getOperand(0); 6905 EVT VecVT = Vec.getValueType(); 6906 6907 // If this is a 256-bit vector result, first extract the 128-bit vector and 6908 // then extract the element from the 128-bit vector. 6909 if (VecVT.is256BitVector()) { 6910 DebugLoc dl = Op.getNode()->getDebugLoc(); 6911 unsigned NumElems = VecVT.getVectorNumElements(); 6912 SDValue Idx = Op.getOperand(1); 6913 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 6914 6915 // Get the 128-bit vector. 6916 Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); 6917 6918 if (IdxVal >= NumElems/2) 6919 IdxVal -= NumElems/2; 6920 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 6921 DAG.getConstant(IdxVal, MVT::i32)); 6922 } 6923 6924 assert(VecVT.is128BitVector() && "Unexpected vector length"); 6925 6926 if (Subtarget->hasSSE41()) { 6927 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 6928 if (Res.getNode()) 6929 return Res; 6930 } 6931 6932 EVT VT = Op.getValueType(); 6933 DebugLoc dl = Op.getDebugLoc(); 6934 // TODO: handle v16i8. 6935 if (VT.getSizeInBits() == 16) { 6936 SDValue Vec = Op.getOperand(0); 6937 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6938 if (Idx == 0) 6939 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6940 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6941 DAG.getNode(ISD::BITCAST, dl, 6942 MVT::v4i32, Vec), 6943 Op.getOperand(1))); 6944 // Transform it so it match pextrw which produces a 32-bit result. 6945 EVT EltVT = MVT::i32; 6946 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 6947 Op.getOperand(0), Op.getOperand(1)); 6948 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 6949 DAG.getValueType(VT)); 6950 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6951 } 6952 6953 if (VT.getSizeInBits() == 32) { 6954 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6955 if (Idx == 0) 6956 return Op; 6957 6958 // SHUFPS the element to the lowest double word, then movss. 6959 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; 6960 EVT VVT = Op.getOperand(0).getValueType(); 6961 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6962 DAG.getUNDEF(VVT), Mask); 6963 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6964 DAG.getIntPtrConstant(0)); 6965 } 6966 6967 if (VT.getSizeInBits() == 64) { 6968 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 6969 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 6970 // to match extract_elt for f64. 6971 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6972 if (Idx == 0) 6973 return Op; 6974 6975 // UNPCKHPD the element to the lowest double word, then movsd. 6976 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 6977 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 
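    // The <1, -1> mask shuffles element 1 down into lane 0 (the second input
    // is undef), so the extract below can always use index 0.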
6978 int Mask[2] = { 1, -1 }; 6979 EVT VVT = Op.getOperand(0).getValueType(); 6980 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6981 DAG.getUNDEF(VVT), Mask); 6982 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6983 DAG.getIntPtrConstant(0)); 6984 } 6985 6986 return SDValue(); 6987} 6988 6989SDValue 6990X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 6991 SelectionDAG &DAG) const { 6992 EVT VT = Op.getValueType(); 6993 EVT EltVT = VT.getVectorElementType(); 6994 DebugLoc dl = Op.getDebugLoc(); 6995 6996 SDValue N0 = Op.getOperand(0); 6997 SDValue N1 = Op.getOperand(1); 6998 SDValue N2 = Op.getOperand(2); 6999 7000 if (!VT.is128BitVector()) 7001 return SDValue(); 7002 7003 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 7004 isa<ConstantSDNode>(N2)) { 7005 unsigned Opc; 7006 if (VT == MVT::v8i16) 7007 Opc = X86ISD::PINSRW; 7008 else if (VT == MVT::v16i8) 7009 Opc = X86ISD::PINSRB; 7010 else 7011 Opc = X86ISD::PINSRB; 7012 7013 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 7014 // argument. 7015 if (N1.getValueType() != MVT::i32) 7016 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 7017 if (N2.getValueType() != MVT::i32) 7018 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 7019 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 7020 } 7021 7022 if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 7023 // Bits [7:6] of the constant are the source select. This will always be 7024 // zero here. The DAG Combiner may combine an extract_elt index into these 7025 // bits. For example (insert (extract, 3), 2) could be matched by putting 7026 // the '3' into bits [7:6] of X86ISD::INSERTPS. 7027 // Bits [5:4] of the constant are the destination select. This is the 7028 // value of the incoming immediate. 7029 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 7030 // combine either bitwise AND or insert of float 0.0 to set these bits. 7031 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 7032 // Create this as a scalar to vector.. 7033 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 7034 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 7035 } 7036 7037 if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) { 7038 // PINSR* works with constant index. 7039 return Op; 7040 } 7041 return SDValue(); 7042} 7043 7044SDValue 7045X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 7046 EVT VT = Op.getValueType(); 7047 EVT EltVT = VT.getVectorElementType(); 7048 7049 DebugLoc dl = Op.getDebugLoc(); 7050 SDValue N0 = Op.getOperand(0); 7051 SDValue N1 = Op.getOperand(1); 7052 SDValue N2 = Op.getOperand(2); 7053 7054 // If this is a 256-bit vector result, first extract the 128-bit vector, 7055 // insert the element into the extracted half and then place it back. 7056 if (VT.is256BitVector()) { 7057 if (!isa<ConstantSDNode>(N2)) 7058 return SDValue(); 7059 7060 // Get the desired 128-bit vector half. 7061 unsigned NumElems = VT.getVectorNumElements(); 7062 unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); 7063 SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); 7064 7065 // Insert the element into the desired half. 7066 bool Upper = IdxVal >= NumElems/2; 7067 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, 7068 DAG.getConstant(Upper ? 
IdxVal-NumElems/2 : IdxVal, MVT::i32)); 7069 7070 // Insert the changed part back to the 256-bit vector 7071 return Insert128BitVector(N0, V, IdxVal, DAG, dl); 7072 } 7073 7074 if (Subtarget->hasSSE41()) 7075 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 7076 7077 if (EltVT == MVT::i8) 7078 return SDValue(); 7079 7080 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 7081 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 7082 // as its second argument. 7083 if (N1.getValueType() != MVT::i32) 7084 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 7085 if (N2.getValueType() != MVT::i32) 7086 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 7087 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 7088 } 7089 return SDValue(); 7090} 7091 7092SDValue 7093X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 7094 LLVMContext *Context = DAG.getContext(); 7095 DebugLoc dl = Op.getDebugLoc(); 7096 EVT OpVT = Op.getValueType(); 7097 7098 // If this is a 256-bit vector result, first insert into a 128-bit 7099 // vector and then insert into the 256-bit vector. 7100 if (!OpVT.is128BitVector()) { 7101 // Insert into a 128-bit vector. 7102 EVT VT128 = EVT::getVectorVT(*Context, 7103 OpVT.getVectorElementType(), 7104 OpVT.getVectorNumElements() / 2); 7105 7106 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 7107 7108 // Insert the 128-bit vector. 7109 return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); 7110 } 7111 7112 if (OpVT == MVT::v1i64 && 7113 Op.getOperand(0).getValueType() == MVT::i64) 7114 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 7115 7116 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 7117 assert(OpVT.is128BitVector() && "Expected an SSE type!"); 7118 return DAG.getNode(ISD::BITCAST, dl, OpVT, 7119 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 7120} 7121 7122// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 7123// a simple subregister reference or explicit instructions to grab 7124// upper bits of a vector. 7125SDValue 7126X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 7127 if (Subtarget->hasAVX()) { 7128 DebugLoc dl = Op.getNode()->getDebugLoc(); 7129 SDValue Vec = Op.getNode()->getOperand(0); 7130 SDValue Idx = Op.getNode()->getOperand(1); 7131 7132 if (Op.getNode()->getValueType(0).is128BitVector() && 7133 Vec.getNode()->getValueType(0).is256BitVector() && 7134 isa<ConstantSDNode>(Idx)) { 7135 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 7136 return Extract128BitVector(Vec, IdxVal, DAG, dl); 7137 } 7138 } 7139 return SDValue(); 7140} 7141 7142// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 7143// simple superregister reference or explicit instructions to insert 7144// the upper bits of a vector. 
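// Only AVX targets reach the subregister path, and only when a 128-bit
// subvector is inserted into a 256-bit result at a constant index; every
// other case returns an empty SDValue and is left to the default handling.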
7145SDValue 7146X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 7147 if (Subtarget->hasAVX()) { 7148 DebugLoc dl = Op.getNode()->getDebugLoc(); 7149 SDValue Vec = Op.getNode()->getOperand(0); 7150 SDValue SubVec = Op.getNode()->getOperand(1); 7151 SDValue Idx = Op.getNode()->getOperand(2); 7152 7153 if (Op.getNode()->getValueType(0).is256BitVector() && 7154 SubVec.getNode()->getValueType(0).is128BitVector() && 7155 isa<ConstantSDNode>(Idx)) { 7156 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 7157 return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); 7158 } 7159 } 7160 return SDValue(); 7161} 7162 7163// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 7164// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 7165// one of the above mentioned nodes. It has to be wrapped because otherwise 7166// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 7167// be used to form addressing mode. These wrapped nodes will be selected 7168// into MOV32ri. 7169SDValue 7170X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 7171 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 7172 7173 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7174 // global base reg. 7175 unsigned char OpFlag = 0; 7176 unsigned WrapperKind = X86ISD::Wrapper; 7177 CodeModel::Model M = getTargetMachine().getCodeModel(); 7178 7179 if (Subtarget->isPICStyleRIPRel() && 7180 (M == CodeModel::Small || M == CodeModel::Kernel)) 7181 WrapperKind = X86ISD::WrapperRIP; 7182 else if (Subtarget->isPICStyleGOT()) 7183 OpFlag = X86II::MO_GOTOFF; 7184 else if (Subtarget->isPICStyleStubPIC()) 7185 OpFlag = X86II::MO_PIC_BASE_OFFSET; 7186 7187 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 7188 CP->getAlignment(), 7189 CP->getOffset(), OpFlag); 7190 DebugLoc DL = CP->getDebugLoc(); 7191 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7192 // With PIC, the address is actually $g + Offset. 7193 if (OpFlag) { 7194 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7195 DAG.getNode(X86ISD::GlobalBaseReg, 7196 DebugLoc(), getPointerTy()), 7197 Result); 7198 } 7199 7200 return Result; 7201} 7202 7203SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 7204 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 7205 7206 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7207 // global base reg. 7208 unsigned char OpFlag = 0; 7209 unsigned WrapperKind = X86ISD::Wrapper; 7210 CodeModel::Model M = getTargetMachine().getCodeModel(); 7211 7212 if (Subtarget->isPICStyleRIPRel() && 7213 (M == CodeModel::Small || M == CodeModel::Kernel)) 7214 WrapperKind = X86ISD::WrapperRIP; 7215 else if (Subtarget->isPICStyleGOT()) 7216 OpFlag = X86II::MO_GOTOFF; 7217 else if (Subtarget->isPICStyleStubPIC()) 7218 OpFlag = X86II::MO_PIC_BASE_OFFSET; 7219 7220 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 7221 OpFlag); 7222 DebugLoc DL = JT->getDebugLoc(); 7223 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7224 7225 // With PIC, the address is actually $g + Offset. 
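  // $g is the PIC base register (X86ISD::GlobalBaseReg); the wrapped
  // jump-table address computed above is an offset relative to it.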
7226 if (OpFlag) 7227 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7228 DAG.getNode(X86ISD::GlobalBaseReg, 7229 DebugLoc(), getPointerTy()), 7230 Result); 7231 7232 return Result; 7233} 7234 7235SDValue 7236X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 7237 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 7238 7239 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7240 // global base reg. 7241 unsigned char OpFlag = 0; 7242 unsigned WrapperKind = X86ISD::Wrapper; 7243 CodeModel::Model M = getTargetMachine().getCodeModel(); 7244 7245 if (Subtarget->isPICStyleRIPRel() && 7246 (M == CodeModel::Small || M == CodeModel::Kernel)) { 7247 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) 7248 OpFlag = X86II::MO_GOTPCREL; 7249 WrapperKind = X86ISD::WrapperRIP; 7250 } else if (Subtarget->isPICStyleGOT()) { 7251 OpFlag = X86II::MO_GOT; 7252 } else if (Subtarget->isPICStyleStubPIC()) { 7253 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; 7254 } else if (Subtarget->isPICStyleStubNoDynamic()) { 7255 OpFlag = X86II::MO_DARWIN_NONLAZY; 7256 } 7257 7258 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 7259 7260 DebugLoc DL = Op.getDebugLoc(); 7261 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7262 7263 7264 // With PIC, the address is actually $g + Offset. 7265 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 7266 !Subtarget->is64Bit()) { 7267 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7268 DAG.getNode(X86ISD::GlobalBaseReg, 7269 DebugLoc(), getPointerTy()), 7270 Result); 7271 } 7272 7273 // For symbols that require a load from a stub to get the address, emit the 7274 // load. 7275 if (isGlobalStubReference(OpFlag)) 7276 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, 7277 MachinePointerInfo::getGOT(), false, false, false, 0); 7278 7279 return Result; 7280} 7281 7282SDValue 7283X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 7284 // Create the TargetBlockAddressAddress node. 7285 unsigned char OpFlags = 7286 Subtarget->ClassifyBlockAddressReference(); 7287 CodeModel::Model M = getTargetMachine().getCodeModel(); 7288 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 7289 DebugLoc dl = Op.getDebugLoc(); 7290 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 7291 /*isTarget=*/true, OpFlags); 7292 7293 if (Subtarget->isPICStyleRIPRel() && 7294 (M == CodeModel::Small || M == CodeModel::Kernel)) 7295 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7296 else 7297 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7298 7299 // With PIC, the address is actually $g + Offset. 7300 if (isGlobalRelativeToPICBase(OpFlags)) { 7301 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7302 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7303 Result); 7304 } 7305 7306 return Result; 7307} 7308 7309SDValue 7310X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 7311 int64_t Offset, 7312 SelectionDAG &DAG) const { 7313 // Create the TargetGlobalAddress node, folding in the constant 7314 // offset if it is legal. 7315 unsigned char OpFlags = 7316 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 7317 CodeModel::Model M = getTargetMachine().getCodeModel(); 7318 SDValue Result; 7319 if (OpFlags == X86II::MO_NO_FLAG && 7320 X86::isOffsetSuitableForCodeModel(Offset, M)) { 7321 // A direct static reference to a global. 
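    // The offset is folded into the TargetGlobalAddress itself, so clear it
    // here to skip the explicit ADD emitted at the end of this function.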
7322 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 7323 Offset = 0; 7324 } else { 7325 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 7326 } 7327 7328 if (Subtarget->isPICStyleRIPRel() && 7329 (M == CodeModel::Small || M == CodeModel::Kernel)) 7330 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7331 else 7332 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7333 7334 // With PIC, the address is actually $g + Offset. 7335 if (isGlobalRelativeToPICBase(OpFlags)) { 7336 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7337 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7338 Result); 7339 } 7340 7341 // For globals that require a load from a stub to get the address, emit the 7342 // load. 7343 if (isGlobalStubReference(OpFlags)) 7344 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 7345 MachinePointerInfo::getGOT(), false, false, false, 0); 7346 7347 // If there was a non-zero offset that we didn't fold, create an explicit 7348 // addition for it. 7349 if (Offset != 0) 7350 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 7351 DAG.getConstant(Offset, getPointerTy())); 7352 7353 return Result; 7354} 7355 7356SDValue 7357X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 7358 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 7359 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 7360 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 7361} 7362 7363static SDValue 7364GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 7365 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 7366 unsigned char OperandFlags, bool LocalDynamic = false) { 7367 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7368 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7369 DebugLoc dl = GA->getDebugLoc(); 7370 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7371 GA->getValueType(0), 7372 GA->getOffset(), 7373 OperandFlags); 7374 7375 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR 7376 : X86ISD::TLSADDR; 7377 7378 if (InFlag) { 7379 SDValue Ops[] = { Chain, TGA, *InFlag }; 7380 Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3); 7381 } else { 7382 SDValue Ops[] = { Chain, TGA }; 7383 Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2); 7384 } 7385 7386 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 7387 MFI->setAdjustsStack(true); 7388 7389 SDValue Flag = Chain.getValue(1); 7390 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 7391} 7392 7393// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 7394static SDValue 7395LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7396 const EVT PtrVT) { 7397 SDValue InFlag; 7398 DebugLoc dl = GA->getDebugLoc(); // ? 
function entry point might be better 7399 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 7400 DAG.getNode(X86ISD::GlobalBaseReg, 7401 DebugLoc(), PtrVT), InFlag); 7402 InFlag = Chain.getValue(1); 7403 7404 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 7405} 7406 7407// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 7408static SDValue 7409LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7410 const EVT PtrVT) { 7411 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 7412 X86::RAX, X86II::MO_TLSGD); 7413} 7414 7415static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, 7416 SelectionDAG &DAG, 7417 const EVT PtrVT, 7418 bool is64Bit) { 7419 DebugLoc dl = GA->getDebugLoc(); 7420 7421 // Get the start address of the TLS block for this module. 7422 X86MachineFunctionInfo* MFI = DAG.getMachineFunction() 7423 .getInfo<X86MachineFunctionInfo>(); 7424 MFI->incNumLocalDynamicTLSAccesses(); 7425 7426 SDValue Base; 7427 if (is64Bit) { 7428 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX, 7429 X86II::MO_TLSLD, /*LocalDynamic=*/true); 7430 } else { 7431 SDValue InFlag; 7432 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 7433 DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag); 7434 InFlag = Chain.getValue(1); 7435 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, 7436 X86II::MO_TLSLDM, /*LocalDynamic=*/true); 7437 } 7438 7439 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations 7440 // of Base. 7441 7442 // Build x@dtpoff. 7443 unsigned char OperandFlags = X86II::MO_DTPOFF; 7444 unsigned WrapperKind = X86ISD::Wrapper; 7445 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7446 GA->getValueType(0), 7447 GA->getOffset(), OperandFlags); 7448 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 7449 7450 // Add x@dtpoff with the base. 7451 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); 7452} 7453 7454// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. 7455static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7456 const EVT PtrVT, TLSModel::Model model, 7457 bool is64Bit, bool isPIC) { 7458 DebugLoc dl = GA->getDebugLoc(); 7459 7460 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 7461 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 7462 is64Bit ? 257 : 256)); 7463 7464 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 7465 DAG.getIntPtrConstant(0), 7466 MachinePointerInfo(Ptr), 7467 false, false, false, 0); 7468 7469 unsigned char OperandFlags = 0; 7470 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 7471 // initialexec. 7472 unsigned WrapperKind = X86ISD::Wrapper; 7473 if (model == TLSModel::LocalExec) { 7474 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 7475 } else if (model == TLSModel::InitialExec) { 7476 if (is64Bit) { 7477 OperandFlags = X86II::MO_GOTTPOFF; 7478 WrapperKind = X86ISD::WrapperRIP; 7479 } else { 7480 OperandFlags = isPIC ? 
X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; 7481 } 7482 } else { 7483 llvm_unreachable("Unexpected model"); 7484 } 7485 7486 // emit "addl x@ntpoff,%eax" (local exec) 7487 // or "addl x@indntpoff,%eax" (initial exec) 7488 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) 7489 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7490 GA->getValueType(0), 7491 GA->getOffset(), OperandFlags); 7492 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 7493 7494 if (model == TLSModel::InitialExec) { 7495 if (isPIC && !is64Bit) { 7496 Offset = DAG.getNode(ISD::ADD, dl, PtrVT, 7497 DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), 7498 Offset); 7499 } 7500 7501 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 7502 MachinePointerInfo::getGOT(), false, false, false, 7503 0); 7504 } 7505 7506 // The address of the thread local variable is the add of the thread 7507 // pointer with the offset of the variable. 7508 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 7509} 7510 7511SDValue 7512X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 7513 7514 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 7515 const GlobalValue *GV = GA->getGlobal(); 7516 7517 if (Subtarget->isTargetELF()) { 7518 TLSModel::Model model = getTargetMachine().getTLSModel(GV); 7519 7520 switch (model) { 7521 case TLSModel::GeneralDynamic: 7522 if (Subtarget->is64Bit()) 7523 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 7524 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 7525 case TLSModel::LocalDynamic: 7526 return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(), 7527 Subtarget->is64Bit()); 7528 case TLSModel::InitialExec: 7529 case TLSModel::LocalExec: 7530 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 7531 Subtarget->is64Bit(), 7532 getTargetMachine().getRelocationModel() == Reloc::PIC_); 7533 } 7534 llvm_unreachable("Unknown TLS model."); 7535 } 7536 7537 if (Subtarget->isTargetDarwin()) { 7538 // Darwin only has one model of TLS. Lower to that. 7539 unsigned char OpFlag = 0; 7540 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 7541 X86ISD::WrapperRIP : X86ISD::Wrapper; 7542 7543 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7544 // global base reg. 7545 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 7546 !Subtarget->is64Bit(); 7547 if (PIC32) 7548 OpFlag = X86II::MO_TLVP_PIC_BASE; 7549 else 7550 OpFlag = X86II::MO_TLVP; 7551 DebugLoc DL = Op.getDebugLoc(); 7552 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 7553 GA->getValueType(0), 7554 GA->getOffset(), OpFlag); 7555 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7556 7557 // With PIC32, the address is actually $g + Offset. 7558 if (PIC32) 7559 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7560 DAG.getNode(X86ISD::GlobalBaseReg, 7561 DebugLoc(), getPointerTy()), 7562 Offset); 7563 7564 // Lowering the machine isd will make sure everything is in the right 7565 // location. 7566 SDValue Chain = DAG.getEntryNode(); 7567 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7568 SDValue Args[] = { Chain, Offset }; 7569 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 7570 7571 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
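    // Same bookkeeping as in GetTLSADDR above: the pseudo becomes a real call
    // during code emission, so frame lowering must account for it.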
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    MFI->setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
                              Chain.getValue(1));
  }

  if (Subtarget->isTargetWindows()) {
    // Just use the implicit TLS architecture.
    // Need to generate something similar to:
    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                              ; from TEB
    //   mov ecx, dword [rel _tls_index]; Load index (from C runtime)
    //   mov rcx, qword [rdx+rcx*8]
    //   mov eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    //   Windows 64bit: gs:0x58
    //   Windows 32bit: fs:__tls_array

    // If GV is an alias then use the aliasee for determining
    // thread-localness.
    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
      GV = GA->resolveAliasedGlobal(false);
    DebugLoc dl = GA->getDebugLoc();
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit).
    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain,
                                        Subtarget->is64Bit()
                                          ? DAG.getIntPtrConstant(0x58)
                                          : DAG.getExternalSymbol("_tls_array",
                                                                  getPointerTy()),
                                        MachinePointerInfo(Ptr),
                                        false, false, false, 0);

    // Load the _tls_index variable.
    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
    if (Subtarget->is64Bit())
      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
                           IDX, MachinePointerInfo(), MVT::i32,
                           false, false, 0);
    else
      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
                        false, false, false, 0);

    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
                                    getPointerTy());
    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);

    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
                      false, false, false, 0);

    // Get the offset of the variable from the start of the .tls section.
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}


/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
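///
/// For exposition only, here is a hedged C sketch (illustrative names, i32
/// parts) of the value SHL_PARTS computes; the lowering below produces the
/// same result branchlessly with SHLD/SHRD plus a CMOV keyed on bit 5 of the
/// shift amount (ShAmt & 32):
///
///   void shl_parts(uint32_t Lo, uint32_t Hi, unsigned Amt,   // Amt < 64
///                  uint32_t *ResLo, uint32_t *ResHi) {
///     if (Amt & 32) {                  // the whole low half moves up
///       *ResHi = Lo << (Amt & 31);
///       *ResLo = 0;
///     } else {                         // SHLD carries bits across the halves
///       *ResHi = (Hi << Amt) | (Amt ? Lo >> (32 - Amt) : 0);
///       *ResLo = Lo << Amt;
///     }
///   }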
7652SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ 7653 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 7654 EVT VT = Op.getValueType(); 7655 unsigned VTBits = VT.getSizeInBits(); 7656 DebugLoc dl = Op.getDebugLoc(); 7657 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 7658 SDValue ShOpLo = Op.getOperand(0); 7659 SDValue ShOpHi = Op.getOperand(1); 7660 SDValue ShAmt = Op.getOperand(2); 7661 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 7662 DAG.getConstant(VTBits - 1, MVT::i8)) 7663 : DAG.getConstant(0, VT); 7664 7665 SDValue Tmp2, Tmp3; 7666 if (Op.getOpcode() == ISD::SHL_PARTS) { 7667 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 7668 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 7669 } else { 7670 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 7671 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 7672 } 7673 7674 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 7675 DAG.getConstant(VTBits, MVT::i8)); 7676 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 7677 AndNode, DAG.getConstant(0, MVT::i8)); 7678 7679 SDValue Hi, Lo; 7680 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7681 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 7682 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 7683 7684 if (Op.getOpcode() == ISD::SHL_PARTS) { 7685 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7686 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7687 } else { 7688 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7689 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7690 } 7691 7692 SDValue Ops[2] = { Lo, Hi }; 7693 return DAG.getMergeValues(Ops, 2, dl); 7694} 7695 7696SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 7697 SelectionDAG &DAG) const { 7698 EVT SrcVT = Op.getOperand(0).getValueType(); 7699 7700 if (SrcVT.isVector()) 7701 return SDValue(); 7702 7703 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 7704 "Unknown SINT_TO_FP to lower!"); 7705 7706 // These are really Legal; return the operand so the caller accepts it as 7707 // Legal. 
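  // (For reference: these are Legal because SSE converts them directly, e.g.
  //    cvtsi2sdl %eax, %xmm0          ; i32 -> f64
  //    cvtsi2sdq %rax, %xmm0          ; i64 -> f64, 64-bit mode only
  //  Everything else falls through to the stack-slot + FILD path below.)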
7708 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 7709 return Op; 7710 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 7711 Subtarget->is64Bit()) { 7712 return Op; 7713 } 7714 7715 DebugLoc dl = Op.getDebugLoc(); 7716 unsigned Size = SrcVT.getSizeInBits()/8; 7717 MachineFunction &MF = DAG.getMachineFunction(); 7718 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 7719 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7720 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7721 StackSlot, 7722 MachinePointerInfo::getFixedStack(SSFI), 7723 false, false, 0); 7724 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 7725} 7726 7727SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 7728 SDValue StackSlot, 7729 SelectionDAG &DAG) const { 7730 // Build the FILD 7731 DebugLoc DL = Op.getDebugLoc(); 7732 SDVTList Tys; 7733 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 7734 if (useSSE) 7735 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 7736 else 7737 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 7738 7739 unsigned ByteSize = SrcVT.getSizeInBits()/8; 7740 7741 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 7742 MachineMemOperand *MMO; 7743 if (FI) { 7744 int SSFI = FI->getIndex(); 7745 MMO = 7746 DAG.getMachineFunction() 7747 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7748 MachineMemOperand::MOLoad, ByteSize, ByteSize); 7749 } else { 7750 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 7751 StackSlot = StackSlot.getOperand(1); 7752 } 7753 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 7754 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 7755 X86ISD::FILD, DL, 7756 Tys, Ops, array_lengthof(Ops), 7757 SrcVT, MMO); 7758 7759 if (useSSE) { 7760 Chain = Result.getValue(1); 7761 SDValue InFlag = Result.getValue(2); 7762 7763 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 7764 // shouldn't be necessary except that RFP cannot be live across 7765 // multiple blocks. When stackifier is fixed, they can be uncoupled. 7766 MachineFunction &MF = DAG.getMachineFunction(); 7767 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 7768 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 7769 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7770 Tys = DAG.getVTList(MVT::Other); 7771 SDValue Ops[] = { 7772 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 7773 }; 7774 MachineMemOperand *MMO = 7775 DAG.getMachineFunction() 7776 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7777 MachineMemOperand::MOStore, SSFISize, SSFISize); 7778 7779 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 7780 Ops, array_lengthof(Ops), 7781 Op.getValueType(), MMO); 7782 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 7783 MachinePointerInfo::getFixedStack(SSFI), 7784 false, false, false, 0); 7785 } 7786 7787 return Result; 7788} 7789 7790// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 7791SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 7792 SelectionDAG &DAG) const { 7793 // This algorithm is not obvious. 
Here it is what we're trying to output: 7794 /* 7795 movq %rax, %xmm0 7796 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } 7797 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } 7798 #ifdef __SSE3__ 7799 haddpd %xmm0, %xmm0 7800 #else 7801 pshufd $0x4e, %xmm0, %xmm1 7802 addpd %xmm1, %xmm0 7803 #endif 7804 */ 7805 7806 DebugLoc dl = Op.getDebugLoc(); 7807 LLVMContext *Context = DAG.getContext(); 7808 7809 // Build some magic constants. 7810 const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; 7811 Constant *C0 = ConstantDataVector::get(*Context, CV0); 7812 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 7813 7814 SmallVector<Constant*,2> CV1; 7815 CV1.push_back( 7816 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 7817 CV1.push_back( 7818 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 7819 Constant *C1 = ConstantVector::get(CV1); 7820 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 7821 7822 // Load the 64-bit value into an XMM register. 7823 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 7824 Op.getOperand(0)); 7825 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 7826 MachinePointerInfo::getConstantPool(), 7827 false, false, false, 16); 7828 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, 7829 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), 7830 CLod0); 7831 7832 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 7833 MachinePointerInfo::getConstantPool(), 7834 false, false, false, 16); 7835 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); 7836 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 7837 SDValue Result; 7838 7839 if (Subtarget->hasSSE3()) { 7840 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. 7841 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); 7842 } else { 7843 SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); 7844 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, 7845 S2F, 0x4E, DAG); 7846 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, 7847 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), 7848 Sub); 7849 } 7850 7851 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, 7852 DAG.getIntPtrConstant(0)); 7853} 7854 7855// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 7856SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 7857 SelectionDAG &DAG) const { 7858 DebugLoc dl = Op.getDebugLoc(); 7859 // FP constant to bias correct the final result. 7860 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 7861 MVT::f64); 7862 7863 // Load the 32-bit value into an XMM register. 7864 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7865 Op.getOperand(0)); 7866 7867 // Zero out the upper parts of the register. 7868 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); 7869 7870 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7871 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 7872 DAG.getIntPtrConstant(0)); 7873 7874 // Or the load with the bias. 
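  // (Exposition only; the scalar equivalent of the bias trick below, assuming
  //  little-endian IEEE-754 doubles and an illustrative 32-bit variable x:
  //    uint64_t bits = 0x4330000000000000ULL | x;  // bit pattern of 2^52 + x
  //    double d;  memcpy(&d, &bits, sizeof d);
  //    result = d - 0x1.0p52;                      // == (double)x exactly
  //  ORing the 32-bit payload into the low mantissa bits is exact because
  //  2^52 + x is representable for every 32-bit x.)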
7875 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 7876 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7877 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7878 MVT::v2f64, Load)), 7879 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7880 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7881 MVT::v2f64, Bias))); 7882 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7883 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 7884 DAG.getIntPtrConstant(0)); 7885 7886 // Subtract the bias. 7887 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 7888 7889 // Handle final rounding. 7890 EVT DestVT = Op.getValueType(); 7891 7892 if (DestVT.bitsLT(MVT::f64)) 7893 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 7894 DAG.getIntPtrConstant(0)); 7895 if (DestVT.bitsGT(MVT::f64)) 7896 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 7897 7898 // Handle final rounding. 7899 return Sub; 7900} 7901 7902SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 7903 SelectionDAG &DAG) const { 7904 SDValue N0 = Op.getOperand(0); 7905 DebugLoc dl = Op.getDebugLoc(); 7906 7907 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 7908 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 7909 // the optimization here. 7910 if (DAG.SignBitIsZero(N0)) 7911 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 7912 7913 EVT SrcVT = N0.getValueType(); 7914 EVT DstVT = Op.getValueType(); 7915 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 7916 return LowerUINT_TO_FP_i64(Op, DAG); 7917 if (SrcVT == MVT::i32 && X86ScalarSSEf64) 7918 return LowerUINT_TO_FP_i32(Op, DAG); 7919 if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) 7920 return SDValue(); 7921 7922 // Make a 64-bit buffer, and use it to build an FILD. 7923 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 7924 if (SrcVT == MVT::i32) { 7925 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 7926 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 7927 getPointerTy(), StackSlot, WordOff); 7928 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7929 StackSlot, MachinePointerInfo(), 7930 false, false, 0); 7931 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 7932 OffsetSlot, MachinePointerInfo(), 7933 false, false, 0); 7934 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 7935 return Fild; 7936 } 7937 7938 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 7939 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7940 StackSlot, MachinePointerInfo(), 7941 false, false, 0); 7942 // For i64 source, we need to add the appropriate power of 2 if the input 7943 // was negative. This is the same as the optimization in 7944 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 7945 // we must be careful to do the computation in x87 extended precision, not 7946 // in SSE. (The generic code can't know it's OK to do this, or how to.) 7947 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 7948 MachineMemOperand *MMO = 7949 DAG.getMachineFunction() 7950 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7951 MachineMemOperand::MOLoad, 8, 8); 7952 7953 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 7954 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 7955 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 7956 MVT::i64, MMO); 7957 7958 APInt FF(32, 0x5F800000ULL); 7959 7960 // Check whether the sign bit is set. 
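  // (0x5F800000 is the IEEE-754 single-precision bit pattern of 2^64. FILD
  //  interpreted the stored i64 as signed, so a negative input was converted
  //  as x - 2^64; conceptually the code below computes
  //    (long double)(int64_t)x + ((int64_t)x < 0 ? 0x1.0p64L : 0.0L)
  //  by selecting either 2^64 or +0.0 out of the constant pool, depending on
  //  the sign bit, and adding it back in on the x87 stack.)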
7961 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 7962 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 7963 ISD::SETLT); 7964 7965 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 7966 SDValue FudgePtr = DAG.getConstantPool( 7967 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 7968 getPointerTy()); 7969 7970 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 7971 SDValue Zero = DAG.getIntPtrConstant(0); 7972 SDValue Four = DAG.getIntPtrConstant(4); 7973 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 7974 Zero, Four); 7975 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 7976 7977 // Load the value out, extending it from f32 to f80. 7978 // FIXME: Avoid the extend by constructing the right constant pool? 7979 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 7980 FudgePtr, MachinePointerInfo::getConstantPool(), 7981 MVT::f32, false, false, 4); 7982 // Extend everything to 80 bits to force it to be done on x87. 7983 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 7984 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 7985} 7986 7987std::pair<SDValue,SDValue> X86TargetLowering:: 7988FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { 7989 DebugLoc DL = Op.getDebugLoc(); 7990 7991 EVT DstTy = Op.getValueType(); 7992 7993 if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { 7994 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 7995 DstTy = MVT::i64; 7996 } 7997 7998 assert(DstTy.getSimpleVT() <= MVT::i64 && 7999 DstTy.getSimpleVT() >= MVT::i16 && 8000 "Unknown FP_TO_INT to lower!"); 8001 8002 // These are really Legal. 8003 if (DstTy == MVT::i32 && 8004 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 8005 return std::make_pair(SDValue(), SDValue()); 8006 if (Subtarget->is64Bit() && 8007 DstTy == MVT::i64 && 8008 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 8009 return std::make_pair(SDValue(), SDValue()); 8010 8011 // We lower FP->int64 either into FISTP64 followed by a load from a temporary 8012 // stack slot, or into the FTOL runtime function. 8013 MachineFunction &MF = DAG.getMachineFunction(); 8014 unsigned MemSize = DstTy.getSizeInBits()/8; 8015 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 8016 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8017 8018 unsigned Opc; 8019 if (!IsSigned && isIntegerTypeFTOL(DstTy)) 8020 Opc = X86ISD::WIN_FTOL; 8021 else 8022 switch (DstTy.getSimpleVT().SimpleTy) { 8023 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 8024 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 8025 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 8026 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 8027 } 8028 8029 SDValue Chain = DAG.getEntryNode(); 8030 SDValue Value = Op.getOperand(0); 8031 EVT TheVT = Op.getOperand(0).getValueType(); 8032 // FIXME This causes a redundant load/store if the SSE-class value is already 8033 // in memory, such as if it is on the callstack. 
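  // (Rough shape of what the non-FTOL path below produces for an SSE f64 to
  //  i64 conversion; exposition only, the exact sequence depends on the
  //  subtarget:
  //    movsd    %xmm0, (%rsp)       ; spill the SSE value
  //    fldl     (%rsp)              ; reload it onto the x87 stack
  //    fisttpll 8(%rsp)             ; truncating store as i64, or fistpll
  //                                 ;   plus a control-word dance pre-SSE3
  //    movq     8(%rsp), %rax
  //  )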
8034 if (isScalarFPTypeInSSEReg(TheVT)) { 8035 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 8036 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 8037 MachinePointerInfo::getFixedStack(SSFI), 8038 false, false, 0); 8039 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 8040 SDValue Ops[] = { 8041 Chain, StackSlot, DAG.getValueType(TheVT) 8042 }; 8043 8044 MachineMemOperand *MMO = 8045 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8046 MachineMemOperand::MOLoad, MemSize, MemSize); 8047 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 8048 DstTy, MMO); 8049 Chain = Value.getValue(1); 8050 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 8051 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8052 } 8053 8054 MachineMemOperand *MMO = 8055 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8056 MachineMemOperand::MOStore, MemSize, MemSize); 8057 8058 if (Opc != X86ISD::WIN_FTOL) { 8059 // Build the FP_TO_INT*_IN_MEM 8060 SDValue Ops[] = { Chain, Value, StackSlot }; 8061 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 8062 Ops, 3, DstTy, MMO); 8063 return std::make_pair(FIST, StackSlot); 8064 } else { 8065 SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, 8066 DAG.getVTList(MVT::Other, MVT::Glue), 8067 Chain, Value); 8068 SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, 8069 MVT::i32, ftol.getValue(1)); 8070 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, 8071 MVT::i32, eax.getValue(2)); 8072 SDValue Ops[] = { eax, edx }; 8073 SDValue pair = IsReplace 8074 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2) 8075 : DAG.getMergeValues(Ops, 2, DL); 8076 return std::make_pair(pair, SDValue()); 8077 } 8078} 8079 8080SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 8081 SelectionDAG &DAG) const { 8082 if (Op.getValueType().isVector()) 8083 return SDValue(); 8084 8085 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 8086 /*IsSigned=*/ true, /*IsReplace=*/ false); 8087 SDValue FIST = Vals.first, StackSlot = Vals.second; 8088 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 8089 if (FIST.getNode() == 0) return Op; 8090 8091 if (StackSlot.getNode()) 8092 // Load the result. 8093 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 8094 FIST, StackSlot, MachinePointerInfo(), 8095 false, false, false, 0); 8096 8097 // The node is the result. 8098 return FIST; 8099} 8100 8101SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 8102 SelectionDAG &DAG) const { 8103 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 8104 /*IsSigned=*/ false, /*IsReplace=*/ false); 8105 SDValue FIST = Vals.first, StackSlot = Vals.second; 8106 assert(FIST.getNode() && "Unexpected failure"); 8107 8108 if (StackSlot.getNode()) 8109 // Load the result. 8110 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 8111 FIST, StackSlot, MachinePointerInfo(), 8112 false, false, false, 0); 8113 8114 // The node is the result. 
8115 return FIST; 8116} 8117 8118SDValue X86TargetLowering::LowerFABS(SDValue Op, 8119 SelectionDAG &DAG) const { 8120 LLVMContext *Context = DAG.getContext(); 8121 DebugLoc dl = Op.getDebugLoc(); 8122 EVT VT = Op.getValueType(); 8123 EVT EltVT = VT; 8124 if (VT.isVector()) 8125 EltVT = VT.getVectorElementType(); 8126 Constant *C; 8127 if (EltVT == MVT::f64) { 8128 C = ConstantVector::getSplat(2, 8129 ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 8130 } else { 8131 C = ConstantVector::getSplat(4, 8132 ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 8133 } 8134 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8135 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8136 MachinePointerInfo::getConstantPool(), 8137 false, false, false, 16); 8138 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 8139} 8140 8141SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 8142 LLVMContext *Context = DAG.getContext(); 8143 DebugLoc dl = Op.getDebugLoc(); 8144 EVT VT = Op.getValueType(); 8145 EVT EltVT = VT; 8146 unsigned NumElts = VT == MVT::f64 ? 2 : 4; 8147 if (VT.isVector()) { 8148 EltVT = VT.getVectorElementType(); 8149 NumElts = VT.getVectorNumElements(); 8150 } 8151 Constant *C; 8152 if (EltVT == MVT::f64) 8153 C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 8154 else 8155 C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 8156 C = ConstantVector::getSplat(NumElts, C); 8157 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8158 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8159 MachinePointerInfo::getConstantPool(), 8160 false, false, false, 16); 8161 if (VT.isVector()) { 8162 MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; 8163 return DAG.getNode(ISD::BITCAST, dl, VT, 8164 DAG.getNode(ISD::XOR, dl, XORVT, 8165 DAG.getNode(ISD::BITCAST, dl, XORVT, 8166 Op.getOperand(0)), 8167 DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); 8168 } 8169 8170 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 8171} 8172 8173SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 8174 LLVMContext *Context = DAG.getContext(); 8175 SDValue Op0 = Op.getOperand(0); 8176 SDValue Op1 = Op.getOperand(1); 8177 DebugLoc dl = Op.getDebugLoc(); 8178 EVT VT = Op.getValueType(); 8179 EVT SrcVT = Op1.getValueType(); 8180 8181 // If second operand is smaller, extend it first. 8182 if (SrcVT.bitsLT(VT)) { 8183 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 8184 SrcVT = VT; 8185 } 8186 // And if it is bigger, shrink it first. 8187 if (SrcVT.bitsGT(VT)) { 8188 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 8189 SrcVT = VT; 8190 } 8191 8192 // At this point the operands and the result should have the same 8193 // type, and that won't be f80 since that is not custom lowered. 8194 8195 // First get the sign bit of second operand. 
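  // (The lowering below is just the bit-level identity
  //    copysign(a, b) == (a & ~SIGN_MASK) | (b & SIGN_MASK)
  //  with SIGN_MASK = 0x8000000000000000 for f64 and 0x80000000 for f32,
  //  carried out with FAND/FOR against constant-pool masks.)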
8196 SmallVector<Constant*,4> CV; 8197 if (SrcVT == MVT::f64) { 8198 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 8199 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 8200 } else { 8201 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 8202 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8203 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8204 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8205 } 8206 Constant *C = ConstantVector::get(CV); 8207 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8208 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 8209 MachinePointerInfo::getConstantPool(), 8210 false, false, false, 16); 8211 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 8212 8213 // Shift sign bit right or left if the two operands have different types. 8214 if (SrcVT.bitsGT(VT)) { 8215 // Op0 is MVT::f32, Op1 is MVT::f64. 8216 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 8217 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 8218 DAG.getConstant(32, MVT::i32)); 8219 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 8220 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 8221 DAG.getIntPtrConstant(0)); 8222 } 8223 8224 // Clear first operand sign bit. 8225 CV.clear(); 8226 if (VT == MVT::f64) { 8227 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 8228 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 8229 } else { 8230 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 8231 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8232 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8233 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8234 } 8235 C = ConstantVector::get(CV); 8236 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8237 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8238 MachinePointerInfo::getConstantPool(), 8239 false, false, false, 16); 8240 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 8241 8242 // Or the value with the sign bit. 8243 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 8244} 8245 8246SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { 8247 SDValue N0 = Op.getOperand(0); 8248 DebugLoc dl = Op.getDebugLoc(); 8249 EVT VT = Op.getValueType(); 8250 8251 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 8252 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 8253 DAG.getConstant(1, VT)); 8254 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 8255} 8256 8257/// Emit nodes that will be selected as "test Op0,Op0", or something 8258/// equivalent. 8259SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 8260 SelectionDAG &DAG) const { 8261 DebugLoc dl = Op.getDebugLoc(); 8262 8263 // CF and OF aren't always set the way we want. Determine which 8264 // of these we need. 
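  // (The goal: if Op itself already set EFLAGS the way "test Op, Op" would,
  //  reuse those flags rather than emitting the test. E.g. for Op = (sub a, b)
  //  feeding a setne:
  //    subl %esi, %edi              ; ZF/SF already reflect the result
  //    jne  .LBB0_2                 ; no separate "testl %edi, %edi" needed
  //  This only works for ZF/SF; when CF or OF is wanted we bail out below,
  //  since TEST defines them as 0 while the arithmetic may not.)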
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO:
    NeedOF = true;
    break;
  }

  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
        default: break;
        case ISD::ADD:
        case ISD::SUB:
        case ISD::AND:
        case ISD::OR:
        case ISD::XOR: {
          NeedTruncation = true;
          ArithOp = Arith;
        }
      }
  }

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
  // which may be the input of a truncate (see above). We use the variable
  // 'Op', the non-truncated value, when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output. Alas, even if none of our users are stores, that
    // doesn't prove we're O.K. Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->getAPIntValue() == 1) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->getAPIntValue().isAllOnesValue()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
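    // (INC/DEC were preferred above because they are smaller to encode; note
    //  that they leave CF untouched, which is only acceptable because the
    //  NeedCF case already bailed out to the CMP pattern at the top.)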
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::AND: {
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    bool NonFlagUse = false;
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
      SDNode *User = *UI;
      unsigned UOpNo = UI.getOperandNo();
      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
        // Look past the truncate.
        UOpNo = User->use_begin().getOperandNo();
        User = *User->use_begin();
      }

      if (User->getOpcode() != ISD::BRCOND &&
          User->getOpcode() != ISD::SETCC &&
          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
        NonFlagUse = true;
        break;
      }
    }

    if (!NonFlagUse)
      break;
  }
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::OR:  Opcode = X86ISD::OR;  break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    break;
  }

  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
    default: break;
    case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
    case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
    case ISD::AND: ConvertedOp = X86ISD::AND; break;
    case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
    case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  if (Opcode == 0)
    // Emit a CMP with 0, which is the TEST pattern.
8447 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 8448 DAG.getConstant(0, Op.getValueType())); 8449 8450 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 8451 SmallVector<SDValue, 4> Ops; 8452 for (unsigned i = 0; i != NumOperands; ++i) 8453 Ops.push_back(Op.getOperand(i)); 8454 8455 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 8456 DAG.ReplaceAllUsesWith(Op, New); 8457 return SDValue(New.getNode(), 1); 8458} 8459 8460/// Emit nodes that will be selected as "cmp Op0,Op1", or something 8461/// equivalent. 8462SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 8463 SelectionDAG &DAG) const { 8464 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 8465 if (C->getAPIntValue() == 0) 8466 return EmitTest(Op0, X86CC, DAG); 8467 8468 DebugLoc dl = Op0.getDebugLoc(); 8469 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || 8470 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { 8471 // Use SUB instead of CMP to enable CSE between SUB and CMP. 8472 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); 8473 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, 8474 Op0, Op1); 8475 return SDValue(Sub.getNode(), 1); 8476 } 8477 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 8478} 8479 8480/// Convert a comparison if required by the subtarget. 8481SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, 8482 SelectionDAG &DAG) const { 8483 // If the subtarget does not support the FUCOMI instruction, floating-point 8484 // comparisons have to be converted. 8485 if (Subtarget->hasCMov() || 8486 Cmp.getOpcode() != X86ISD::CMP || 8487 !Cmp.getOperand(0).getValueType().isFloatingPoint() || 8488 !Cmp.getOperand(1).getValueType().isFloatingPoint()) 8489 return Cmp; 8490 8491 // The instruction selector will select an FUCOM instruction instead of 8492 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence 8493 // build an SDNode sequence that transfers the result from FPSW into EFLAGS: 8494 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) 8495 DebugLoc dl = Cmp.getDebugLoc(); 8496 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); 8497 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); 8498 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, 8499 DAG.getConstant(8, MVT::i8)); 8500 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); 8501 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); 8502} 8503 8504/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 8505/// if it's possible. 8506SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 8507 DebugLoc dl, SelectionDAG &DAG) const { 8508 SDValue Op0 = And.getOperand(0); 8509 SDValue Op1 = And.getOperand(1); 8510 if (Op0.getOpcode() == ISD::TRUNCATE) 8511 Op0 = Op0.getOperand(0); 8512 if (Op1.getOpcode() == ISD::TRUNCATE) 8513 Op1 = Op1.getOperand(0); 8514 8515 SDValue LHS, RHS; 8516 if (Op1.getOpcode() == ISD::SHL) 8517 std::swap(Op0, Op1); 8518 if (Op0.getOpcode() == ISD::SHL) { 8519 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 8520 if (And00C->getZExtValue() == 1) { 8521 // If we looked past a truncate, check that it's only truncating away 8522 // known zeros. 
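        // (The transform being attempted here is
        //    (X & (1 << N)) != 0  -->  bt %N, %X   ; answer lands in CF
        //  and a truncate between the shift and the AND is only transparent
        //  when every bit it drops is provably zero; otherwise the tested bit
        //  could live in the part being thrown away.)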
8523 unsigned BitWidth = Op0.getValueSizeInBits(); 8524 unsigned AndBitWidth = And.getValueSizeInBits(); 8525 if (BitWidth > AndBitWidth) { 8526 APInt Zeros, Ones; 8527 DAG.ComputeMaskedBits(Op0, Zeros, Ones); 8528 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 8529 return SDValue(); 8530 } 8531 LHS = Op1; 8532 RHS = Op0.getOperand(1); 8533 } 8534 } else if (Op1.getOpcode() == ISD::Constant) { 8535 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 8536 uint64_t AndRHSVal = AndRHS->getZExtValue(); 8537 SDValue AndLHS = Op0; 8538 8539 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { 8540 LHS = AndLHS.getOperand(0); 8541 RHS = AndLHS.getOperand(1); 8542 } 8543 8544 // Use BT if the immediate can't be encoded in a TEST instruction. 8545 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { 8546 LHS = AndLHS; 8547 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); 8548 } 8549 } 8550 8551 if (LHS.getNode()) { 8552 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 8553 // instruction. Since the shift amount is in-range-or-undefined, we know 8554 // that doing a bittest on the i32 value is ok. We extend to i32 because 8555 // the encoding for the i16 version is larger than the i32 version. 8556 // Also promote i16 to i32 for performance / code size reason. 8557 if (LHS.getValueType() == MVT::i8 || 8558 LHS.getValueType() == MVT::i16) 8559 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 8560 8561 // If the operand types disagree, extend the shift amount to match. Since 8562 // BT ignores high bits (like shifts) we can use anyextend. 8563 if (LHS.getValueType() != RHS.getValueType()) 8564 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 8565 8566 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 8567 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 8568 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8569 DAG.getConstant(Cond, MVT::i8), BT); 8570 } 8571 8572 return SDValue(); 8573} 8574 8575SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 8576 8577 if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); 8578 8579 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 8580 SDValue Op0 = Op.getOperand(0); 8581 SDValue Op1 = Op.getOperand(1); 8582 DebugLoc dl = Op.getDebugLoc(); 8583 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 8584 8585 // Optimize to BT if possible. 8586 // Lower (X & (1 << N)) == 0 to BT(X, N). 8587 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 8588 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 8589 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 8590 Op1.getOpcode() == ISD::Constant && 8591 cast<ConstantSDNode>(Op1)->isNullValue() && 8592 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8593 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 8594 if (NewSetCC.getNode()) 8595 return NewSetCC; 8596 } 8597 8598 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 8599 // these. 8600 if (Op1.getOpcode() == ISD::Constant && 8601 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 8602 cast<ConstantSDNode>(Op1)->isNullValue()) && 8603 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8604 8605 // If the input is a setcc, then reuse the input setcc or use a new one with 8606 // the inverted condition. 
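    // (E.g. comparing an existing X86ISD::SETCC result against 0 with SETNE
    //  simply returns that result, while comparing it with SETEQ returns a
    //  SETCC on the opposite condition code instead of stacking a second
    //  setcc on top of the first.)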
8607 if (Op0.getOpcode() == X86ISD::SETCC) { 8608 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 8609 bool Invert = (CC == ISD::SETNE) ^ 8610 cast<ConstantSDNode>(Op1)->isNullValue(); 8611 if (!Invert) return Op0; 8612 8613 CCode = X86::GetOppositeBranchCondition(CCode); 8614 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8615 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 8616 } 8617 } 8618 8619 bool isFP = Op1.getValueType().isFloatingPoint(); 8620 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 8621 if (X86CC == X86::COND_INVALID) 8622 return SDValue(); 8623 8624 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 8625 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); 8626 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8627 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 8628} 8629 8630// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 8631// ones, and then concatenate the result back. 8632static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { 8633 EVT VT = Op.getValueType(); 8634 8635 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && 8636 "Unsupported value type for operation"); 8637 8638 unsigned NumElems = VT.getVectorNumElements(); 8639 DebugLoc dl = Op.getDebugLoc(); 8640 SDValue CC = Op.getOperand(2); 8641 8642 // Extract the LHS vectors 8643 SDValue LHS = Op.getOperand(0); 8644 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 8645 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 8646 8647 // Extract the RHS vectors 8648 SDValue RHS = Op.getOperand(1); 8649 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 8650 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 8651 8652 // Issue the operation on the smaller types and concatenate the result back 8653 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 8654 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 8655 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 8656 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), 8657 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); 8658} 8659 8660 8661SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 8662 SDValue Cond; 8663 SDValue Op0 = Op.getOperand(0); 8664 SDValue Op1 = Op.getOperand(1); 8665 SDValue CC = Op.getOperand(2); 8666 EVT VT = Op.getValueType(); 8667 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 8668 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 8669 DebugLoc dl = Op.getDebugLoc(); 8670 8671 if (isFP) { 8672#ifndef NDEBUG 8673 EVT EltVT = Op0.getValueType().getVectorElementType(); 8674 assert(EltVT == MVT::f32 || EltVT == MVT::f64); 8675#endif 8676 8677 unsigned SSECC; 8678 bool Swap = false; 8679 8680 // SSE Condition code mapping: 8681 // 0 - EQ 8682 // 1 - LT 8683 // 2 - LE 8684 // 3 - UNORD 8685 // 4 - NEQ 8686 // 5 - NLT 8687 // 6 - NLE 8688 // 7 - ORD 8689 switch (SetCCOpcode) { 8690 default: llvm_unreachable("Unexpected SETCC condition"); 8691 case ISD::SETOEQ: 8692 case ISD::SETEQ: SSECC = 0; break; 8693 case ISD::SETOGT: 8694 case ISD::SETGT: Swap = true; // Fallthrough 8695 case ISD::SETLT: 8696 case ISD::SETOLT: SSECC = 1; break; 8697 case ISD::SETOGE: 8698 case ISD::SETGE: Swap = true; // Fallthrough 8699 case ISD::SETLE: 8700 case ISD::SETOLE: SSECC = 2; break; 8701 case ISD::SETUO: SSECC = 3; break; 8702 case ISD::SETUNE: 8703 case ISD::SETNE: SSECC = 4; break; 8704 case ISD::SETULE: Swap = true; // Fallthrough 8705 case ISD::SETUGE: SSECC = 5; break; 8706 case ISD::SETULT: Swap = true; // 
Fallthrough 8707 case ISD::SETUGT: SSECC = 6; break; 8708 case ISD::SETO: SSECC = 7; break; 8709 case ISD::SETUEQ: 8710 case ISD::SETONE: SSECC = 8; break; 8711 } 8712 if (Swap) 8713 std::swap(Op0, Op1); 8714 8715 // In the two special cases we can't handle, emit two comparisons. 8716 if (SSECC == 8) { 8717 unsigned CC0, CC1; 8718 unsigned CombineOpc; 8719 if (SetCCOpcode == ISD::SETUEQ) { 8720 CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; 8721 } else { 8722 assert(SetCCOpcode == ISD::SETONE); 8723 CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; 8724 } 8725 8726 SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 8727 DAG.getConstant(CC0, MVT::i8)); 8728 SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 8729 DAG.getConstant(CC1, MVT::i8)); 8730 return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); 8731 } 8732 // Handle all other FP comparisons here. 8733 return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 8734 DAG.getConstant(SSECC, MVT::i8)); 8735 } 8736 8737 // Break 256-bit integer vector compare into smaller ones. 8738 if (VT.is256BitVector() && !Subtarget->hasAVX2()) 8739 return Lower256IntVSETCC(Op, DAG); 8740 8741 // We are handling one of the integer comparisons here. Since SSE only has 8742 // GT and EQ comparisons for integer, swapping operands and multiple 8743 // operations may be required for some comparisons. 8744 unsigned Opc; 8745 bool Swap = false, Invert = false, FlipSigns = false; 8746 8747 switch (SetCCOpcode) { 8748 default: llvm_unreachable("Unexpected SETCC condition"); 8749 case ISD::SETNE: Invert = true; 8750 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; 8751 case ISD::SETLT: Swap = true; 8752 case ISD::SETGT: Opc = X86ISD::PCMPGT; break; 8753 case ISD::SETGE: Swap = true; 8754 case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; 8755 case ISD::SETULT: Swap = true; 8756 case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; 8757 case ISD::SETUGE: Swap = true; 8758 case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; 8759 } 8760 if (Swap) 8761 std::swap(Op0, Op1); 8762 8763 // Check that the operation in question is available (most are plain SSE2, 8764 // but PCMPGTQ and PCMPEQQ have different requirements). 8765 if (VT == MVT::v2i64) { 8766 if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) 8767 return SDValue(); 8768 if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) 8769 return SDValue(); 8770 } 8771 8772 // Since SSE has no unsigned integer comparisons, we need to flip the sign 8773 // bits of the inputs before performing those operations. 8774 if (FlipSigns) { 8775 EVT EltVT = VT.getVectorElementType(); 8776 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 8777 EltVT); 8778 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 8779 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 8780 SignBits.size()); 8781 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 8782 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 8783 } 8784 8785 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 8786 8787 // If the logical-not of the result is required, perform that now. 8788 if (Invert) 8789 Result = DAG.getNOT(dl, Result, VT); 8790 8791 return Result; 8792} 8793 8794// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
8795static bool isX86LogicalCmp(SDValue Op) { 8796 unsigned Opc = Op.getNode()->getOpcode(); 8797 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || 8798 Opc == X86ISD::SAHF) 8799 return true; 8800 if (Op.getResNo() == 1 && 8801 (Opc == X86ISD::ADD || 8802 Opc == X86ISD::SUB || 8803 Opc == X86ISD::ADC || 8804 Opc == X86ISD::SBB || 8805 Opc == X86ISD::SMUL || 8806 Opc == X86ISD::UMUL || 8807 Opc == X86ISD::INC || 8808 Opc == X86ISD::DEC || 8809 Opc == X86ISD::OR || 8810 Opc == X86ISD::XOR || 8811 Opc == X86ISD::AND)) 8812 return true; 8813 8814 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 8815 return true; 8816 8817 return false; 8818} 8819 8820static bool isZero(SDValue V) { 8821 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8822 return C && C->isNullValue(); 8823} 8824 8825static bool isAllOnes(SDValue V) { 8826 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8827 return C && C->isAllOnesValue(); 8828} 8829 8830static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { 8831 if (V.getOpcode() != ISD::TRUNCATE) 8832 return false; 8833 8834 SDValue VOp0 = V.getOperand(0); 8835 unsigned InBits = VOp0.getValueSizeInBits(); 8836 unsigned Bits = V.getValueSizeInBits(); 8837 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); 8838} 8839 8840SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 8841 bool addTest = true; 8842 SDValue Cond = Op.getOperand(0); 8843 SDValue Op1 = Op.getOperand(1); 8844 SDValue Op2 = Op.getOperand(2); 8845 DebugLoc DL = Op.getDebugLoc(); 8846 SDValue CC; 8847 8848 if (Cond.getOpcode() == ISD::SETCC) { 8849 SDValue NewCond = LowerSETCC(Cond, DAG); 8850 if (NewCond.getNode()) 8851 Cond = NewCond; 8852 } 8853 8854 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 8855 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 8856 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 8857 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 8858 if (Cond.getOpcode() == X86ISD::SETCC && 8859 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 8860 isZero(Cond.getOperand(1).getOperand(1))) { 8861 SDValue Cmp = Cond.getOperand(1); 8862 8863 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 8864 8865 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 8866 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 8867 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 8868 8869 SDValue CmpOp0 = Cmp.getOperand(0); 8870 // Apply further optimizations for special cases 8871 // (select (x != 0), -1, 0) -> neg & sbb 8872 // (select (x == 0), 0, -1) -> neg & sbb 8873 if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) 8874 if (YC->isNullValue() && 8875 (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { 8876 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); 8877 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, 8878 DAG.getConstant(0, CmpOp0.getValueType()), 8879 CmpOp0); 8880 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8881 DAG.getConstant(X86::COND_B, MVT::i8), 8882 SDValue(Neg.getNode(), 1)); 8883 return Res; 8884 } 8885 8886 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 8887 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 8888 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 8889 8890 SDValue Res = // Res = 0 or -1. 
8891 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8892 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 8893 8894 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 8895 Res = DAG.getNOT(DL, Res, Res.getValueType()); 8896 8897 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 8898 if (N2C == 0 || !N2C->isNullValue()) 8899 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 8900 return Res; 8901 } 8902 } 8903 8904 // Look past (and (setcc_carry (cmp ...)), 1). 8905 if (Cond.getOpcode() == ISD::AND && 8906 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8907 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8908 if (C && C->getAPIntValue() == 1) 8909 Cond = Cond.getOperand(0); 8910 } 8911 8912 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8913 // setting operand in place of the X86ISD::SETCC. 8914 unsigned CondOpcode = Cond.getOpcode(); 8915 if (CondOpcode == X86ISD::SETCC || 8916 CondOpcode == X86ISD::SETCC_CARRY) { 8917 CC = Cond.getOperand(0); 8918 8919 SDValue Cmp = Cond.getOperand(1); 8920 unsigned Opc = Cmp.getOpcode(); 8921 EVT VT = Op.getValueType(); 8922 8923 bool IllegalFPCMov = false; 8924 if (VT.isFloatingPoint() && !VT.isVector() && 8925 !isScalarFPTypeInSSEReg(VT)) // FPStack? 8926 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 8927 8928 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 8929 Opc == X86ISD::BT) { // FIXME 8930 Cond = Cmp; 8931 addTest = false; 8932 } 8933 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 8934 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 8935 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 8936 Cond.getOperand(0).getValueType() != MVT::i8)) { 8937 SDValue LHS = Cond.getOperand(0); 8938 SDValue RHS = Cond.getOperand(1); 8939 unsigned X86Opcode; 8940 unsigned X86Cond; 8941 SDVTList VTs; 8942 switch (CondOpcode) { 8943 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 8944 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 8945 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 8946 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 8947 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 8948 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 8949 default: llvm_unreachable("unexpected overflowing operator"); 8950 } 8951 if (CondOpcode == ISD::UMULO) 8952 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 8953 MVT::i32); 8954 else 8955 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 8956 8957 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); 8958 8959 if (CondOpcode == ISD::UMULO) 8960 Cond = X86Op.getValue(2); 8961 else 8962 Cond = X86Op.getValue(1); 8963 8964 CC = DAG.getConstant(X86Cond, MVT::i8); 8965 addTest = false; 8966 } 8967 8968 if (addTest) { 8969 // Look pass the truncate if the high bits are known zero. 8970 if (isTruncWithZeroHighBitsInput(Cond, DAG)) 8971 Cond = Cond.getOperand(0); 8972 8973 // We know the result of AND is compared against zero. Try to match 8974 // it to BT. 
8975 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8976 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 8977 if (NewSetCC.getNode()) { 8978 CC = NewSetCC.getOperand(0); 8979 Cond = NewSetCC.getOperand(1); 8980 addTest = false; 8981 } 8982 } 8983 } 8984 8985 if (addTest) { 8986 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8987 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8988 } 8989 8990 // a < b ? -1 : 0 -> RES = ~setcc_carry 8991 // a < b ? 0 : -1 -> RES = setcc_carry 8992 // a >= b ? -1 : 0 -> RES = setcc_carry 8993 // a >= b ? 0 : -1 -> RES = ~setcc_carry 8994 if (Cond.getOpcode() == X86ISD::SUB) { 8995 Cond = ConvertCmpIfNecessary(Cond, DAG); 8996 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 8997 8998 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 8999 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 9000 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 9001 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 9002 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 9003 return DAG.getNOT(DL, Res, Res.getValueType()); 9004 return Res; 9005 } 9006 } 9007 9008 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 9009 // condition is true. 9010 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 9011 SDValue Ops[] = { Op2, Op1, CC, Cond }; 9012 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 9013} 9014 9015// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 9016// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 9017// from the AND / OR. 9018static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 9019 Opc = Op.getOpcode(); 9020 if (Opc != ISD::OR && Opc != ISD::AND) 9021 return false; 9022 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 9023 Op.getOperand(0).hasOneUse() && 9024 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 9025 Op.getOperand(1).hasOneUse()); 9026} 9027 9028// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 9029// 1 and that the SETCC node has a single use. 9030static bool isXor1OfSetCC(SDValue Op) { 9031 if (Op.getOpcode() != ISD::XOR) 9032 return false; 9033 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 9034 if (N1C && N1C->getAPIntValue() == 1) { 9035 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 9036 Op.getOperand(0).hasOneUse(); 9037 } 9038 return false; 9039} 9040 9041SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 9042 bool addTest = true; 9043 SDValue Chain = Op.getOperand(0); 9044 SDValue Cond = Op.getOperand(1); 9045 SDValue Dest = Op.getOperand(2); 9046 DebugLoc dl = Op.getDebugLoc(); 9047 SDValue CC; 9048 bool Inverted = false; 9049 9050 if (Cond.getOpcode() == ISD::SETCC) { 9051 // Check for setcc([su]{add,sub,mul}o == 0). 
9052 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && 9053 isa<ConstantSDNode>(Cond.getOperand(1)) && 9054 cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && 9055 Cond.getOperand(0).getResNo() == 1 && 9056 (Cond.getOperand(0).getOpcode() == ISD::SADDO || 9057 Cond.getOperand(0).getOpcode() == ISD::UADDO || 9058 Cond.getOperand(0).getOpcode() == ISD::SSUBO || 9059 Cond.getOperand(0).getOpcode() == ISD::USUBO || 9060 Cond.getOperand(0).getOpcode() == ISD::SMULO || 9061 Cond.getOperand(0).getOpcode() == ISD::UMULO)) { 9062 Inverted = true; 9063 Cond = Cond.getOperand(0); 9064 } else { 9065 SDValue NewCond = LowerSETCC(Cond, DAG); 9066 if (NewCond.getNode()) 9067 Cond = NewCond; 9068 } 9069 } 9070#if 0 9071 // FIXME: LowerXALUO doesn't handle these!! 9072 else if (Cond.getOpcode() == X86ISD::ADD || 9073 Cond.getOpcode() == X86ISD::SUB || 9074 Cond.getOpcode() == X86ISD::SMUL || 9075 Cond.getOpcode() == X86ISD::UMUL) 9076 Cond = LowerXALUO(Cond, DAG); 9077#endif 9078 9079 // Look past (and (setcc_carry (cmp ...)), 1). 9080 if (Cond.getOpcode() == ISD::AND && 9081 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 9082 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 9083 if (C && C->getAPIntValue() == 1) 9084 Cond = Cond.getOperand(0); 9085 } 9086 9087 // If condition flag is set by an X86ISD::CMP, then use it as the condition 9088 // setting operand in place of the X86ISD::SETCC. 9089 unsigned CondOpcode = Cond.getOpcode(); 9090 if (CondOpcode == X86ISD::SETCC || 9091 CondOpcode == X86ISD::SETCC_CARRY) { 9092 CC = Cond.getOperand(0); 9093 9094 SDValue Cmp = Cond.getOperand(1); 9095 unsigned Opc = Cmp.getOpcode(); 9096 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 9097 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 9098 Cond = Cmp; 9099 addTest = false; 9100 } else { 9101 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 9102 default: break; 9103 case X86::COND_O: 9104 case X86::COND_B: 9105 // These can only come from an arithmetic instruction with overflow, 9106 // e.g. SADDO, UADDO.
9107 Cond = Cond.getNode()->getOperand(1); 9108 addTest = false; 9109 break; 9110 } 9111 } 9112 } 9113 CondOpcode = Cond.getOpcode(); 9114 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 9115 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 9116 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 9117 Cond.getOperand(0).getValueType() != MVT::i8)) { 9118 SDValue LHS = Cond.getOperand(0); 9119 SDValue RHS = Cond.getOperand(1); 9120 unsigned X86Opcode; 9121 unsigned X86Cond; 9122 SDVTList VTs; 9123 switch (CondOpcode) { 9124 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 9125 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 9126 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 9127 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 9128 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 9129 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 9130 default: llvm_unreachable("unexpected overflowing operator"); 9131 } 9132 if (Inverted) 9133 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); 9134 if (CondOpcode == ISD::UMULO) 9135 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 9136 MVT::i32); 9137 else 9138 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 9139 9140 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); 9141 9142 if (CondOpcode == ISD::UMULO) 9143 Cond = X86Op.getValue(2); 9144 else 9145 Cond = X86Op.getValue(1); 9146 9147 CC = DAG.getConstant(X86Cond, MVT::i8); 9148 addTest = false; 9149 } else { 9150 unsigned CondOpc; 9151 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 9152 SDValue Cmp = Cond.getOperand(0).getOperand(1); 9153 if (CondOpc == ISD::OR) { 9154 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 9155 // two branches instead of an explicit OR instruction with a 9156 // separate test. 9157 if (Cmp == Cond.getOperand(1).getOperand(1) && 9158 isX86LogicalCmp(Cmp)) { 9159 CC = Cond.getOperand(0).getOperand(0); 9160 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 9161 Chain, Dest, CC, Cmp); 9162 CC = Cond.getOperand(1).getOperand(0); 9163 Cond = Cmp; 9164 addTest = false; 9165 } 9166 } else { // ISD::AND 9167 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 9168 // two branches instead of an explicit AND instruction with a 9169 // separate test. However, we only do this if this block doesn't 9170 // have a fall-through edge, because this requires an explicit 9171 // jmp when the condition is false. 9172 if (Cmp == Cond.getOperand(1).getOperand(1) && 9173 isX86LogicalCmp(Cmp) && 9174 Op.getNode()->hasOneUse()) { 9175 X86::CondCode CCode = 9176 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 9177 CCode = X86::GetOppositeBranchCondition(CCode); 9178 CC = DAG.getConstant(CCode, MVT::i8); 9179 SDNode *User = *Op.getNode()->use_begin(); 9180 // Look for an unconditional branch following this conditional branch. 9181 // We need this because we need to reverse the successors in order 9182 // to implement FCMP_OEQ. 
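// In other words (illustrative sketch; cc1/cc2 are whatever conditions the two
// SETCCs carried), br (and (setcc cc1, flags), (setcc cc2, flags)), %T, %F is
// emitted as:
//   j<!cc1>  %F
//   j<!cc2>  %F
//   jmp      %T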
9183 if (User->getOpcode() == ISD::BR) { 9184 SDValue FalseBB = User->getOperand(1); 9185 SDNode *NewBR = 9186 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 9187 assert(NewBR == User); 9188 (void)NewBR; 9189 Dest = FalseBB; 9190 9191 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 9192 Chain, Dest, CC, Cmp); 9193 X86::CondCode CCode = 9194 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 9195 CCode = X86::GetOppositeBranchCondition(CCode); 9196 CC = DAG.getConstant(CCode, MVT::i8); 9197 Cond = Cmp; 9198 addTest = false; 9199 } 9200 } 9201 } 9202 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 9203 // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition. 9204 // It should be transformed by the dag combiner, except when the condition 9205 // is set by an arithmetic-with-overflow node. 9206 X86::CondCode CCode = 9207 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 9208 CCode = X86::GetOppositeBranchCondition(CCode); 9209 CC = DAG.getConstant(CCode, MVT::i8); 9210 Cond = Cond.getOperand(0).getOperand(1); 9211 addTest = false; 9212 } else if (Cond.getOpcode() == ISD::SETCC && 9213 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { 9214 // For FCMP_OEQ, we can emit 9215 // two branches instead of an explicit AND instruction with a 9216 // separate test. However, we only do this if this block doesn't 9217 // have a fall-through edge, because this requires an explicit 9218 // jmp when the condition is false. 9219 if (Op.getNode()->hasOneUse()) { 9220 SDNode *User = *Op.getNode()->use_begin(); 9221 // Look for an unconditional branch following this conditional branch. 9222 // We need this because we need to reverse the successors in order 9223 // to implement FCMP_OEQ. 9224 if (User->getOpcode() == ISD::BR) { 9225 SDValue FalseBB = User->getOperand(1); 9226 SDNode *NewBR = 9227 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 9228 assert(NewBR == User); 9229 (void)NewBR; 9230 Dest = FalseBB; 9231 9232 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 9233 Cond.getOperand(0), Cond.getOperand(1)); 9234 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 9235 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 9236 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 9237 Chain, Dest, CC, Cmp); 9238 CC = DAG.getConstant(X86::COND_P, MVT::i8); 9239 Cond = Cmp; 9240 addTest = false; 9241 } 9242 } 9243 } else if (Cond.getOpcode() == ISD::SETCC && 9244 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { 9245 // For FCMP_UNE, we can emit 9246 // two branches instead of an explicit OR instruction with a 9247 // separate test. However, we only do this if this block doesn't 9248 // have a fall-through edge, because this requires an explicit 9249 // jmp when the condition is false. 9250 if (Op.getNode()->hasOneUse()) { 9251 SDNode *User = *Op.getNode()->use_begin(); 9252 // Look for an unconditional branch following this conditional branch. 9253 // We need this because we need to reverse the successors in order 9254 // to implement FCMP_UNE.
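// Sketch of the sequence this produces for br (fcmp une x, y), %T, %F
// (illustrative, SSE case; x87 is handled via ConvertCmpIfNecessary):
//   ucomiss  %xmm1, %xmm0
//   jne      %T
//   jnp      %F
//   jmp      %T              ; unordered (PF=1) means 'une' is true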
9255 if (User->getOpcode() == ISD::BR) { 9256 SDValue FalseBB = User->getOperand(1); 9257 SDNode *NewBR = 9258 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 9259 assert(NewBR == User); 9260 (void)NewBR; 9261 9262 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 9263 Cond.getOperand(0), Cond.getOperand(1)); 9264 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 9265 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 9266 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 9267 Chain, Dest, CC, Cmp); 9268 CC = DAG.getConstant(X86::COND_NP, MVT::i8); 9269 Cond = Cmp; 9270 addTest = false; 9271 Dest = FalseBB; 9272 } 9273 } 9274 } 9275 } 9276 9277 if (addTest) { 9278 // Look past the truncate if the high bits are known zero. 9279 if (isTruncWithZeroHighBitsInput(Cond, DAG)) 9280 Cond = Cond.getOperand(0); 9281 9282 // We know the result of AND is compared against zero. Try to match 9283 // it to BT. 9284 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 9285 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 9286 if (NewSetCC.getNode()) { 9287 CC = NewSetCC.getOperand(0); 9288 Cond = NewSetCC.getOperand(1); 9289 addTest = false; 9290 } 9291 } 9292 } 9293 9294 if (addTest) { 9295 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 9296 Cond = EmitTest(Cond, X86::COND_NE, DAG); 9297 } 9298 Cond = ConvertCmpIfNecessary(Cond, DAG); 9299 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 9300 Chain, Dest, CC, Cond); 9301} 9302 9303 9304 // Lower dynamic stack allocation to _alloca call for Cygwin/MinGW targets. 9305 // Calls to _alloca are needed to probe the stack when allocating more than 4k 9306 // bytes in one go. Touching the stack at 4K increments is necessary to ensure 9307 // that the guard pages used by the OS virtual memory manager are allocated in 9308 // correct sequence. 9309 SDValue 9310 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 9311 SelectionDAG &DAG) const { 9312 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || 9313 getTargetMachine().Options.EnableSegmentedStacks) && 9314 "This should be used only on Windows targets or when segmented stacks " 9315 "are being used"); 9316 assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); 9317 DebugLoc dl = Op.getDebugLoc(); 9318 9319 // Get the inputs. 9320 SDValue Chain = Op.getOperand(0); 9321 SDValue Size = Op.getOperand(1); 9322 // FIXME: Ensure alignment here 9323 9324 bool Is64Bit = Subtarget->is64Bit(); 9325 EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; 9326 9327 if (getTargetMachine().Options.EnableSegmentedStacks) { 9328 MachineFunction &MF = DAG.getMachineFunction(); 9329 MachineRegisterInfo &MRI = MF.getRegInfo(); 9330 9331 if (Is64Bit) { 9332 // The 64 bit implementation of segmented stacks needs to clobber both r10 9333 // and r11. This makes it impossible to use it along with nested parameters. 9334 const Function *F = MF.getFunction(); 9335 9336 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); 9337 I != E; ++I) 9338 if (I->hasNestAttr()) 9339 report_fatal_error("Cannot use segmented stacks with functions that " 9340 "have nested arguments."); 9341 } 9342 9343 const TargetRegisterClass *AddrRegClass = 9344 getRegClassFor(Subtarget->is64Bit() ?
MVT::i64:MVT::i32); 9345 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); 9346 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); 9347 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, 9348 DAG.getRegister(Vreg, SPTy)); 9349 SDValue Ops1[2] = { Value, Chain }; 9350 return DAG.getMergeValues(Ops1, 2, dl); 9351 } else { 9352 SDValue Flag; 9353 unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); 9354 9355 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 9356 Flag = Chain.getValue(1); 9357 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 9358 9359 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 9360 Flag = Chain.getValue(1); 9361 9362 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 9363 9364 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 9365 return DAG.getMergeValues(Ops1, 2, dl); 9366 } 9367} 9368 9369SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 9370 MachineFunction &MF = DAG.getMachineFunction(); 9371 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 9372 9373 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9374 DebugLoc DL = Op.getDebugLoc(); 9375 9376 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 9377 // vastart just stores the address of the VarArgsFrameIndex slot into the 9378 // memory location argument. 9379 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 9380 getPointerTy()); 9381 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 9382 MachinePointerInfo(SV), false, false, 0); 9383 } 9384 9385 // __va_list_tag: 9386 // gp_offset (0 - 6 * 8) 9387 // fp_offset (48 - 48 + 8 * 16) 9388 // overflow_arg_area (point to parameters coming in memory). 9389 // reg_save_area 9390 SmallVector<SDValue, 8> MemOps; 9391 SDValue FIN = Op.getOperand(1); 9392 // Store gp_offset 9393 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 9394 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 9395 MVT::i32), 9396 FIN, MachinePointerInfo(SV), false, false, 0); 9397 MemOps.push_back(Store); 9398 9399 // Store fp_offset 9400 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9401 FIN, DAG.getIntPtrConstant(4)); 9402 Store = DAG.getStore(Op.getOperand(0), DL, 9403 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 9404 MVT::i32), 9405 FIN, MachinePointerInfo(SV, 4), false, false, 0); 9406 MemOps.push_back(Store); 9407 9408 // Store ptr to overflow_arg_area 9409 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9410 FIN, DAG.getIntPtrConstant(4)); 9411 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 9412 getPointerTy()); 9413 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 9414 MachinePointerInfo(SV, 8), 9415 false, false, 0); 9416 MemOps.push_back(Store); 9417 9418 // Store ptr to reg_save_area. 
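// (reg_save_area sits at byte offset 16 of the va_list: the two i32 offsets and
// the 8-byte overflow_arg_area pointer stored above precede it.)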
9419 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9420 FIN, DAG.getIntPtrConstant(8)); 9421 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 9422 getPointerTy()); 9423 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 9424 MachinePointerInfo(SV, 16), false, false, 0); 9425 MemOps.push_back(Store); 9426 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 9427 &MemOps[0], MemOps.size()); 9428} 9429 9430SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 9431 assert(Subtarget->is64Bit() && 9432 "LowerVAARG only handles 64-bit va_arg!"); 9433 assert((Subtarget->isTargetLinux() || 9434 Subtarget->isTargetDarwin()) && 9435 "Unhandled target in LowerVAARG"); 9436 assert(Op.getNode()->getNumOperands() == 4); 9437 SDValue Chain = Op.getOperand(0); 9438 SDValue SrcPtr = Op.getOperand(1); 9439 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9440 unsigned Align = Op.getConstantOperandVal(3); 9441 DebugLoc dl = Op.getDebugLoc(); 9442 9443 EVT ArgVT = Op.getNode()->getValueType(0); 9444 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 9445 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 9446 uint8_t ArgMode; 9447 9448 // Decide which area this value should be read from. 9449 // TODO: Implement the AMD64 ABI in its entirety. This simple 9450 // selection mechanism works only for the basic types. 9451 if (ArgVT == MVT::f80) { 9452 llvm_unreachable("va_arg for f80 not yet implemented"); 9453 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 9454 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 9455 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 9456 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 9457 } else { 9458 llvm_unreachable("Unhandled argument type in LowerVAARG"); 9459 } 9460 9461 if (ArgMode == 2) { 9462 // Sanity Check: Make sure using fp_offset makes sense. 9463 assert(!getTargetMachine().Options.UseSoftFloat && 9464 !(DAG.getMachineFunction() 9465 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 9466 Subtarget->hasSSE1()); 9467 } 9468 9469 // Insert VAARG_64 node into the DAG 9470 // VAARG_64 returns two values: Variable Argument Address, Chain 9471 SmallVector<SDValue, 11> InstOps; 9472 InstOps.push_back(Chain); 9473 InstOps.push_back(SrcPtr); 9474 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 9475 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 9476 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 9477 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 9478 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 9479 VTs, &InstOps[0], InstOps.size(), 9480 MVT::i64, 9481 MachinePointerInfo(SV), 9482 /*Align=*/0, 9483 /*Volatile=*/false, 9484 /*ReadMem=*/true, 9485 /*WriteMem=*/true); 9486 Chain = VAARG.getValue(1); 9487 9488 // Load the next argument and return it 9489 return DAG.getLoad(ArgVT, dl, 9490 Chain, 9491 VAARG, 9492 MachinePointerInfo(), 9493 false, false, false, 0); 9494} 9495 9496SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 9497 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
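// Roughly (x86-64 SysV layout assumed here):
//   struct __va_list_tag {
//     unsigned gp_offset;          // byte 0
//     unsigned fp_offset;          // byte 4
//     void    *overflow_arg_area;  // byte 8
//     void    *reg_save_area;      // byte 16
//   };                             // 24 bytes, hence the memcpy size below.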
9498 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 9499 SDValue Chain = Op.getOperand(0); 9500 SDValue DstPtr = Op.getOperand(1); 9501 SDValue SrcPtr = Op.getOperand(2); 9502 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 9503 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 9504 DebugLoc DL = Op.getDebugLoc(); 9505 9506 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 9507 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 9508 false, 9509 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 9510} 9511 9512// getTargetVShiftNode - Handle vector element shifts where the shift amount 9513// may or may not be a constant. Takes the immediate version of the shift as input. 9514static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, 9515 SDValue SrcOp, SDValue ShAmt, 9516 SelectionDAG &DAG) { 9517 assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); 9518 9519 if (isa<ConstantSDNode>(ShAmt)) { 9520 // Constant may be a TargetConstant. Use a regular constant. 9521 uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue(); 9522 switch (Opc) { 9523 default: llvm_unreachable("Unknown target vector shift node"); 9524 case X86ISD::VSHLI: 9525 case X86ISD::VSRLI: 9526 case X86ISD::VSRAI: 9527 return DAG.getNode(Opc, dl, VT, SrcOp, 9528 DAG.getConstant(ShiftAmt, MVT::i32)); 9529 } 9530 } 9531 9532 // Change opcode to non-immediate version 9533 switch (Opc) { 9534 default: llvm_unreachable("Unknown target vector shift node"); 9535 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; 9536 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; 9537 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; 9538 } 9539 9540 // Need to build a vector containing the shift amount. 9541 // The shift amount is 32 bits, but the SSE instructions read 64 bits, so fill with 0. 9542 SDValue ShOps[4]; 9543 ShOps[0] = ShAmt; 9544 ShOps[1] = DAG.getConstant(0, MVT::i32); 9545 ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); 9546 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); 9547 9548 // The return type has to be a 128-bit type with the same element 9549 // type as the input type. 9550 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 9551 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); 9552 9553 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); 9554 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); 9555} 9556 9557SDValue 9558X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 9559 DebugLoc dl = Op.getDebugLoc(); 9560 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9561 switch (IntNo) { 9562 default: return SDValue(); // Don't custom lower most intrinsics. 9563 // Comparison intrinsics.
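// Each of these becomes a scalar (U)COMI compare plus a SETCC on EFLAGS,
// zero-extended to i32; e.g. (illustrative)
//   x86_sse_comieq_ss(a, b)
//     -> (zext i32 (X86ISD::SETCC COND_E, (X86ISD::COMI a, b)))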
9564 case Intrinsic::x86_sse_comieq_ss: 9565 case Intrinsic::x86_sse_comilt_ss: 9566 case Intrinsic::x86_sse_comile_ss: 9567 case Intrinsic::x86_sse_comigt_ss: 9568 case Intrinsic::x86_sse_comige_ss: 9569 case Intrinsic::x86_sse_comineq_ss: 9570 case Intrinsic::x86_sse_ucomieq_ss: 9571 case Intrinsic::x86_sse_ucomilt_ss: 9572 case Intrinsic::x86_sse_ucomile_ss: 9573 case Intrinsic::x86_sse_ucomigt_ss: 9574 case Intrinsic::x86_sse_ucomige_ss: 9575 case Intrinsic::x86_sse_ucomineq_ss: 9576 case Intrinsic::x86_sse2_comieq_sd: 9577 case Intrinsic::x86_sse2_comilt_sd: 9578 case Intrinsic::x86_sse2_comile_sd: 9579 case Intrinsic::x86_sse2_comigt_sd: 9580 case Intrinsic::x86_sse2_comige_sd: 9581 case Intrinsic::x86_sse2_comineq_sd: 9582 case Intrinsic::x86_sse2_ucomieq_sd: 9583 case Intrinsic::x86_sse2_ucomilt_sd: 9584 case Intrinsic::x86_sse2_ucomile_sd: 9585 case Intrinsic::x86_sse2_ucomigt_sd: 9586 case Intrinsic::x86_sse2_ucomige_sd: 9587 case Intrinsic::x86_sse2_ucomineq_sd: { 9588 unsigned Opc; 9589 ISD::CondCode CC; 9590 switch (IntNo) { 9591 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 9592 case Intrinsic::x86_sse_comieq_ss: 9593 case Intrinsic::x86_sse2_comieq_sd: 9594 Opc = X86ISD::COMI; 9595 CC = ISD::SETEQ; 9596 break; 9597 case Intrinsic::x86_sse_comilt_ss: 9598 case Intrinsic::x86_sse2_comilt_sd: 9599 Opc = X86ISD::COMI; 9600 CC = ISD::SETLT; 9601 break; 9602 case Intrinsic::x86_sse_comile_ss: 9603 case Intrinsic::x86_sse2_comile_sd: 9604 Opc = X86ISD::COMI; 9605 CC = ISD::SETLE; 9606 break; 9607 case Intrinsic::x86_sse_comigt_ss: 9608 case Intrinsic::x86_sse2_comigt_sd: 9609 Opc = X86ISD::COMI; 9610 CC = ISD::SETGT; 9611 break; 9612 case Intrinsic::x86_sse_comige_ss: 9613 case Intrinsic::x86_sse2_comige_sd: 9614 Opc = X86ISD::COMI; 9615 CC = ISD::SETGE; 9616 break; 9617 case Intrinsic::x86_sse_comineq_ss: 9618 case Intrinsic::x86_sse2_comineq_sd: 9619 Opc = X86ISD::COMI; 9620 CC = ISD::SETNE; 9621 break; 9622 case Intrinsic::x86_sse_ucomieq_ss: 9623 case Intrinsic::x86_sse2_ucomieq_sd: 9624 Opc = X86ISD::UCOMI; 9625 CC = ISD::SETEQ; 9626 break; 9627 case Intrinsic::x86_sse_ucomilt_ss: 9628 case Intrinsic::x86_sse2_ucomilt_sd: 9629 Opc = X86ISD::UCOMI; 9630 CC = ISD::SETLT; 9631 break; 9632 case Intrinsic::x86_sse_ucomile_ss: 9633 case Intrinsic::x86_sse2_ucomile_sd: 9634 Opc = X86ISD::UCOMI; 9635 CC = ISD::SETLE; 9636 break; 9637 case Intrinsic::x86_sse_ucomigt_ss: 9638 case Intrinsic::x86_sse2_ucomigt_sd: 9639 Opc = X86ISD::UCOMI; 9640 CC = ISD::SETGT; 9641 break; 9642 case Intrinsic::x86_sse_ucomige_ss: 9643 case Intrinsic::x86_sse2_ucomige_sd: 9644 Opc = X86ISD::UCOMI; 9645 CC = ISD::SETGE; 9646 break; 9647 case Intrinsic::x86_sse_ucomineq_ss: 9648 case Intrinsic::x86_sse2_ucomineq_sd: 9649 Opc = X86ISD::UCOMI; 9650 CC = ISD::SETNE; 9651 break; 9652 } 9653 9654 SDValue LHS = Op.getOperand(1); 9655 SDValue RHS = Op.getOperand(2); 9656 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 9657 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 9658 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 9659 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 9660 DAG.getConstant(X86CC, MVT::i8), Cond); 9661 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 9662 } 9663 9664 // Arithmetic intrinsics. 
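// These simply map onto the corresponding target nodes (PMULUDQ, FHADD/FHSUB,
// HADD/HSUB) so the usual isel patterns can select them; e.g. (illustrative)
//   x86_sse3_hadd_ps(a, b) -> (X86ISD::FHADD a, b)   // later matched to HADDPS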
9665 case Intrinsic::x86_sse2_pmulu_dq: 9666 case Intrinsic::x86_avx2_pmulu_dq: 9667 return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), 9668 Op.getOperand(1), Op.getOperand(2)); 9669 9670 // SSE3/AVX horizontal add/sub intrinsics 9671 case Intrinsic::x86_sse3_hadd_ps: 9672 case Intrinsic::x86_sse3_hadd_pd: 9673 case Intrinsic::x86_avx_hadd_ps_256: 9674 case Intrinsic::x86_avx_hadd_pd_256: 9675 case Intrinsic::x86_sse3_hsub_ps: 9676 case Intrinsic::x86_sse3_hsub_pd: 9677 case Intrinsic::x86_avx_hsub_ps_256: 9678 case Intrinsic::x86_avx_hsub_pd_256: 9679 case Intrinsic::x86_ssse3_phadd_w_128: 9680 case Intrinsic::x86_ssse3_phadd_d_128: 9681 case Intrinsic::x86_avx2_phadd_w: 9682 case Intrinsic::x86_avx2_phadd_d: 9683 case Intrinsic::x86_ssse3_phsub_w_128: 9684 case Intrinsic::x86_ssse3_phsub_d_128: 9685 case Intrinsic::x86_avx2_phsub_w: 9686 case Intrinsic::x86_avx2_phsub_d: { 9687 unsigned Opcode; 9688 switch (IntNo) { 9689 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 9690 case Intrinsic::x86_sse3_hadd_ps: 9691 case Intrinsic::x86_sse3_hadd_pd: 9692 case Intrinsic::x86_avx_hadd_ps_256: 9693 case Intrinsic::x86_avx_hadd_pd_256: 9694 Opcode = X86ISD::FHADD; 9695 break; 9696 case Intrinsic::x86_sse3_hsub_ps: 9697 case Intrinsic::x86_sse3_hsub_pd: 9698 case Intrinsic::x86_avx_hsub_ps_256: 9699 case Intrinsic::x86_avx_hsub_pd_256: 9700 Opcode = X86ISD::FHSUB; 9701 break; 9702 case Intrinsic::x86_ssse3_phadd_w_128: 9703 case Intrinsic::x86_ssse3_phadd_d_128: 9704 case Intrinsic::x86_avx2_phadd_w: 9705 case Intrinsic::x86_avx2_phadd_d: 9706 Opcode = X86ISD::HADD; 9707 break; 9708 case Intrinsic::x86_ssse3_phsub_w_128: 9709 case Intrinsic::x86_ssse3_phsub_d_128: 9710 case Intrinsic::x86_avx2_phsub_w: 9711 case Intrinsic::x86_avx2_phsub_d: 9712 Opcode = X86ISD::HSUB; 9713 break; 9714 } 9715 return DAG.getNode(Opcode, dl, Op.getValueType(), 9716 Op.getOperand(1), Op.getOperand(2)); 9717 } 9718 9719 // AVX2 variable shift intrinsics 9720 case Intrinsic::x86_avx2_psllv_d: 9721 case Intrinsic::x86_avx2_psllv_q: 9722 case Intrinsic::x86_avx2_psllv_d_256: 9723 case Intrinsic::x86_avx2_psllv_q_256: 9724 case Intrinsic::x86_avx2_psrlv_d: 9725 case Intrinsic::x86_avx2_psrlv_q: 9726 case Intrinsic::x86_avx2_psrlv_d_256: 9727 case Intrinsic::x86_avx2_psrlv_q_256: 9728 case Intrinsic::x86_avx2_psrav_d: 9729 case Intrinsic::x86_avx2_psrav_d_256: { 9730 unsigned Opcode; 9731 switch (IntNo) { 9732 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
9733 case Intrinsic::x86_avx2_psllv_d: 9734 case Intrinsic::x86_avx2_psllv_q: 9735 case Intrinsic::x86_avx2_psllv_d_256: 9736 case Intrinsic::x86_avx2_psllv_q_256: 9737 Opcode = ISD::SHL; 9738 break; 9739 case Intrinsic::x86_avx2_psrlv_d: 9740 case Intrinsic::x86_avx2_psrlv_q: 9741 case Intrinsic::x86_avx2_psrlv_d_256: 9742 case Intrinsic::x86_avx2_psrlv_q_256: 9743 Opcode = ISD::SRL; 9744 break; 9745 case Intrinsic::x86_avx2_psrav_d: 9746 case Intrinsic::x86_avx2_psrav_d_256: 9747 Opcode = ISD::SRA; 9748 break; 9749 } 9750 return DAG.getNode(Opcode, dl, Op.getValueType(), 9751 Op.getOperand(1), Op.getOperand(2)); 9752 } 9753 9754 case Intrinsic::x86_ssse3_pshuf_b_128: 9755 case Intrinsic::x86_avx2_pshuf_b: 9756 return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), 9757 Op.getOperand(1), Op.getOperand(2)); 9758 9759 case Intrinsic::x86_ssse3_psign_b_128: 9760 case Intrinsic::x86_ssse3_psign_w_128: 9761 case Intrinsic::x86_ssse3_psign_d_128: 9762 case Intrinsic::x86_avx2_psign_b: 9763 case Intrinsic::x86_avx2_psign_w: 9764 case Intrinsic::x86_avx2_psign_d: 9765 return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), 9766 Op.getOperand(1), Op.getOperand(2)); 9767 9768 case Intrinsic::x86_sse41_insertps: 9769 return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), 9770 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 9771 9772 case Intrinsic::x86_avx_vperm2f128_ps_256: 9773 case Intrinsic::x86_avx_vperm2f128_pd_256: 9774 case Intrinsic::x86_avx_vperm2f128_si_256: 9775 case Intrinsic::x86_avx2_vperm2i128: 9776 return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), 9777 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 9778 9779 case Intrinsic::x86_avx2_permd: 9780 case Intrinsic::x86_avx2_permps: 9781 // Operands intentionally swapped. Mask is last operand to intrinsic, 9782 // but second operand for node/instruction. 9783 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), 9784 Op.getOperand(2), Op.getOperand(1)); 9785 9786 // ptest and testp intrinsics. The intrinsics these come from are designed to 9787 // return an integer value, not just an instruction, so lower them to the ptest 9788 // or testp pattern and a setcc for the result.
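// e.g. (illustrative)
//   x86_sse41_ptestz(a, b)
//     -> (zext i32 (X86ISD::SETCC COND_E, (X86ISD::PTEST a, b)))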
9789 case Intrinsic::x86_sse41_ptestz: 9790 case Intrinsic::x86_sse41_ptestc: 9791 case Intrinsic::x86_sse41_ptestnzc: 9792 case Intrinsic::x86_avx_ptestz_256: 9793 case Intrinsic::x86_avx_ptestc_256: 9794 case Intrinsic::x86_avx_ptestnzc_256: 9795 case Intrinsic::x86_avx_vtestz_ps: 9796 case Intrinsic::x86_avx_vtestc_ps: 9797 case Intrinsic::x86_avx_vtestnzc_ps: 9798 case Intrinsic::x86_avx_vtestz_pd: 9799 case Intrinsic::x86_avx_vtestc_pd: 9800 case Intrinsic::x86_avx_vtestnzc_pd: 9801 case Intrinsic::x86_avx_vtestz_ps_256: 9802 case Intrinsic::x86_avx_vtestc_ps_256: 9803 case Intrinsic::x86_avx_vtestnzc_ps_256: 9804 case Intrinsic::x86_avx_vtestz_pd_256: 9805 case Intrinsic::x86_avx_vtestc_pd_256: 9806 case Intrinsic::x86_avx_vtestnzc_pd_256: { 9807 bool IsTestPacked = false; 9808 unsigned X86CC; 9809 switch (IntNo) { 9810 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 9811 case Intrinsic::x86_avx_vtestz_ps: 9812 case Intrinsic::x86_avx_vtestz_pd: 9813 case Intrinsic::x86_avx_vtestz_ps_256: 9814 case Intrinsic::x86_avx_vtestz_pd_256: 9815 IsTestPacked = true; // Fallthrough 9816 case Intrinsic::x86_sse41_ptestz: 9817 case Intrinsic::x86_avx_ptestz_256: 9818 // ZF = 1 9819 X86CC = X86::COND_E; 9820 break; 9821 case Intrinsic::x86_avx_vtestc_ps: 9822 case Intrinsic::x86_avx_vtestc_pd: 9823 case Intrinsic::x86_avx_vtestc_ps_256: 9824 case Intrinsic::x86_avx_vtestc_pd_256: 9825 IsTestPacked = true; // Fallthrough 9826 case Intrinsic::x86_sse41_ptestc: 9827 case Intrinsic::x86_avx_ptestc_256: 9828 // CF = 1 9829 X86CC = X86::COND_B; 9830 break; 9831 case Intrinsic::x86_avx_vtestnzc_ps: 9832 case Intrinsic::x86_avx_vtestnzc_pd: 9833 case Intrinsic::x86_avx_vtestnzc_ps_256: 9834 case Intrinsic::x86_avx_vtestnzc_pd_256: 9835 IsTestPacked = true; // Fallthrough 9836 case Intrinsic::x86_sse41_ptestnzc: 9837 case Intrinsic::x86_avx_ptestnzc_256: 9838 // ZF and CF = 0 9839 X86CC = X86::COND_A; 9840 break; 9841 } 9842 9843 SDValue LHS = Op.getOperand(1); 9844 SDValue RHS = Op.getOperand(2); 9845 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 9846 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 9847 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 9848 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 9849 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 9850 } 9851 9852 // SSE/AVX shift intrinsics 9853 case Intrinsic::x86_sse2_psll_w: 9854 case Intrinsic::x86_sse2_psll_d: 9855 case Intrinsic::x86_sse2_psll_q: 9856 case Intrinsic::x86_avx2_psll_w: 9857 case Intrinsic::x86_avx2_psll_d: 9858 case Intrinsic::x86_avx2_psll_q: 9859 case Intrinsic::x86_sse2_psrl_w: 9860 case Intrinsic::x86_sse2_psrl_d: 9861 case Intrinsic::x86_sse2_psrl_q: 9862 case Intrinsic::x86_avx2_psrl_w: 9863 case Intrinsic::x86_avx2_psrl_d: 9864 case Intrinsic::x86_avx2_psrl_q: 9865 case Intrinsic::x86_sse2_psra_w: 9866 case Intrinsic::x86_sse2_psra_d: 9867 case Intrinsic::x86_avx2_psra_w: 9868 case Intrinsic::x86_avx2_psra_d: { 9869 unsigned Opcode; 9870 switch (IntNo) { 9871 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
9872 case Intrinsic::x86_sse2_psll_w: 9873 case Intrinsic::x86_sse2_psll_d: 9874 case Intrinsic::x86_sse2_psll_q: 9875 case Intrinsic::x86_avx2_psll_w: 9876 case Intrinsic::x86_avx2_psll_d: 9877 case Intrinsic::x86_avx2_psll_q: 9878 Opcode = X86ISD::VSHL; 9879 break; 9880 case Intrinsic::x86_sse2_psrl_w: 9881 case Intrinsic::x86_sse2_psrl_d: 9882 case Intrinsic::x86_sse2_psrl_q: 9883 case Intrinsic::x86_avx2_psrl_w: 9884 case Intrinsic::x86_avx2_psrl_d: 9885 case Intrinsic::x86_avx2_psrl_q: 9886 Opcode = X86ISD::VSRL; 9887 break; 9888 case Intrinsic::x86_sse2_psra_w: 9889 case Intrinsic::x86_sse2_psra_d: 9890 case Intrinsic::x86_avx2_psra_w: 9891 case Intrinsic::x86_avx2_psra_d: 9892 Opcode = X86ISD::VSRA; 9893 break; 9894 } 9895 return DAG.getNode(Opcode, dl, Op.getValueType(), 9896 Op.getOperand(1), Op.getOperand(2)); 9897 } 9898 9899 // SSE/AVX immediate shift intrinsics 9900 case Intrinsic::x86_sse2_pslli_w: 9901 case Intrinsic::x86_sse2_pslli_d: 9902 case Intrinsic::x86_sse2_pslli_q: 9903 case Intrinsic::x86_avx2_pslli_w: 9904 case Intrinsic::x86_avx2_pslli_d: 9905 case Intrinsic::x86_avx2_pslli_q: 9906 case Intrinsic::x86_sse2_psrli_w: 9907 case Intrinsic::x86_sse2_psrli_d: 9908 case Intrinsic::x86_sse2_psrli_q: 9909 case Intrinsic::x86_avx2_psrli_w: 9910 case Intrinsic::x86_avx2_psrli_d: 9911 case Intrinsic::x86_avx2_psrli_q: 9912 case Intrinsic::x86_sse2_psrai_w: 9913 case Intrinsic::x86_sse2_psrai_d: 9914 case Intrinsic::x86_avx2_psrai_w: 9915 case Intrinsic::x86_avx2_psrai_d: { 9916 unsigned Opcode; 9917 switch (IntNo) { 9918 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 9919 case Intrinsic::x86_sse2_pslli_w: 9920 case Intrinsic::x86_sse2_pslli_d: 9921 case Intrinsic::x86_sse2_pslli_q: 9922 case Intrinsic::x86_avx2_pslli_w: 9923 case Intrinsic::x86_avx2_pslli_d: 9924 case Intrinsic::x86_avx2_pslli_q: 9925 Opcode = X86ISD::VSHLI; 9926 break; 9927 case Intrinsic::x86_sse2_psrli_w: 9928 case Intrinsic::x86_sse2_psrli_d: 9929 case Intrinsic::x86_sse2_psrli_q: 9930 case Intrinsic::x86_avx2_psrli_w: 9931 case Intrinsic::x86_avx2_psrli_d: 9932 case Intrinsic::x86_avx2_psrli_q: 9933 Opcode = X86ISD::VSRLI; 9934 break; 9935 case Intrinsic::x86_sse2_psrai_w: 9936 case Intrinsic::x86_sse2_psrai_d: 9937 case Intrinsic::x86_avx2_psrai_w: 9938 case Intrinsic::x86_avx2_psrai_d: 9939 Opcode = X86ISD::VSRAI; 9940 break; 9941 } 9942 return getTargetVShiftNode(Opcode, dl, Op.getValueType(), 9943 Op.getOperand(1), Op.getOperand(2), DAG); 9944 } 9945 9946 case Intrinsic::x86_sse42_pcmpistria128: 9947 case Intrinsic::x86_sse42_pcmpestria128: 9948 case Intrinsic::x86_sse42_pcmpistric128: 9949 case Intrinsic::x86_sse42_pcmpestric128: 9950 case Intrinsic::x86_sse42_pcmpistrio128: 9951 case Intrinsic::x86_sse42_pcmpestrio128: 9952 case Intrinsic::x86_sse42_pcmpistris128: 9953 case Intrinsic::x86_sse42_pcmpestris128: 9954 case Intrinsic::x86_sse42_pcmpistriz128: 9955 case Intrinsic::x86_sse42_pcmpestriz128: { 9956 unsigned Opcode; 9957 unsigned X86CC; 9958 switch (IntNo) { 9959 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
9960 case Intrinsic::x86_sse42_pcmpistria128: 9961 Opcode = X86ISD::PCMPISTRI; 9962 X86CC = X86::COND_A; 9963 break; 9964 case Intrinsic::x86_sse42_pcmpestria128: 9965 Opcode = X86ISD::PCMPESTRI; 9966 X86CC = X86::COND_A; 9967 break; 9968 case Intrinsic::x86_sse42_pcmpistric128: 9969 Opcode = X86ISD::PCMPISTRI; 9970 X86CC = X86::COND_B; 9971 break; 9972 case Intrinsic::x86_sse42_pcmpestric128: 9973 Opcode = X86ISD::PCMPESTRI; 9974 X86CC = X86::COND_B; 9975 break; 9976 case Intrinsic::x86_sse42_pcmpistrio128: 9977 Opcode = X86ISD::PCMPISTRI; 9978 X86CC = X86::COND_O; 9979 break; 9980 case Intrinsic::x86_sse42_pcmpestrio128: 9981 Opcode = X86ISD::PCMPESTRI; 9982 X86CC = X86::COND_O; 9983 break; 9984 case Intrinsic::x86_sse42_pcmpistris128: 9985 Opcode = X86ISD::PCMPISTRI; 9986 X86CC = X86::COND_S; 9987 break; 9988 case Intrinsic::x86_sse42_pcmpestris128: 9989 Opcode = X86ISD::PCMPESTRI; 9990 X86CC = X86::COND_S; 9991 break; 9992 case Intrinsic::x86_sse42_pcmpistriz128: 9993 Opcode = X86ISD::PCMPISTRI; 9994 X86CC = X86::COND_E; 9995 break; 9996 case Intrinsic::x86_sse42_pcmpestriz128: 9997 Opcode = X86ISD::PCMPESTRI; 9998 X86CC = X86::COND_E; 9999 break; 10000 } 10001 SmallVector<SDValue, 5> NewOps; 10002 NewOps.append(Op->op_begin()+1, Op->op_end()); 10003 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 10004 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); 10005 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 10006 DAG.getConstant(X86CC, MVT::i8), 10007 SDValue(PCMP.getNode(), 1)); 10008 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 10009 } 10010 10011 case Intrinsic::x86_sse42_pcmpistri128: 10012 case Intrinsic::x86_sse42_pcmpestri128: { 10013 unsigned Opcode; 10014 if (IntNo == Intrinsic::x86_sse42_pcmpistri128) 10015 Opcode = X86ISD::PCMPISTRI; 10016 else 10017 Opcode = X86ISD::PCMPESTRI; 10018 10019 SmallVector<SDValue, 5> NewOps; 10020 NewOps.append(Op->op_begin()+1, Op->op_end()); 10021 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 10022 return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); 10023 } 10024 case Intrinsic::x86_fma_vfmadd_ps: 10025 case Intrinsic::x86_fma_vfmadd_pd: 10026 case Intrinsic::x86_fma_vfmsub_ps: 10027 case Intrinsic::x86_fma_vfmsub_pd: 10028 case Intrinsic::x86_fma_vfnmadd_ps: 10029 case Intrinsic::x86_fma_vfnmadd_pd: 10030 case Intrinsic::x86_fma_vfnmsub_ps: 10031 case Intrinsic::x86_fma_vfnmsub_pd: 10032 case Intrinsic::x86_fma_vfmaddsub_ps: 10033 case Intrinsic::x86_fma_vfmaddsub_pd: 10034 case Intrinsic::x86_fma_vfmsubadd_ps: 10035 case Intrinsic::x86_fma_vfmsubadd_pd: 10036 case Intrinsic::x86_fma_vfmadd_ps_256: 10037 case Intrinsic::x86_fma_vfmadd_pd_256: 10038 case Intrinsic::x86_fma_vfmsub_ps_256: 10039 case Intrinsic::x86_fma_vfmsub_pd_256: 10040 case Intrinsic::x86_fma_vfnmadd_ps_256: 10041 case Intrinsic::x86_fma_vfnmadd_pd_256: 10042 case Intrinsic::x86_fma_vfnmsub_ps_256: 10043 case Intrinsic::x86_fma_vfnmsub_pd_256: 10044 case Intrinsic::x86_fma_vfmaddsub_ps_256: 10045 case Intrinsic::x86_fma_vfmaddsub_pd_256: 10046 case Intrinsic::x86_fma_vfmsubadd_ps_256: 10047 case Intrinsic::x86_fma_vfmsubadd_pd_256: { 10048 unsigned Opc; 10049 switch (IntNo) { 10050 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
10051 case Intrinsic::x86_fma_vfmadd_ps: 10052 case Intrinsic::x86_fma_vfmadd_pd: 10053 case Intrinsic::x86_fma_vfmadd_ps_256: 10054 case Intrinsic::x86_fma_vfmadd_pd_256: 10055 Opc = X86ISD::FMADD; 10056 break; 10057 case Intrinsic::x86_fma_vfmsub_ps: 10058 case Intrinsic::x86_fma_vfmsub_pd: 10059 case Intrinsic::x86_fma_vfmsub_ps_256: 10060 case Intrinsic::x86_fma_vfmsub_pd_256: 10061 Opc = X86ISD::FMSUB; 10062 break; 10063 case Intrinsic::x86_fma_vfnmadd_ps: 10064 case Intrinsic::x86_fma_vfnmadd_pd: 10065 case Intrinsic::x86_fma_vfnmadd_ps_256: 10066 case Intrinsic::x86_fma_vfnmadd_pd_256: 10067 Opc = X86ISD::FNMADD; 10068 break; 10069 case Intrinsic::x86_fma_vfnmsub_ps: 10070 case Intrinsic::x86_fma_vfnmsub_pd: 10071 case Intrinsic::x86_fma_vfnmsub_ps_256: 10072 case Intrinsic::x86_fma_vfnmsub_pd_256: 10073 Opc = X86ISD::FNMSUB; 10074 break; 10075 case Intrinsic::x86_fma_vfmaddsub_ps: 10076 case Intrinsic::x86_fma_vfmaddsub_pd: 10077 case Intrinsic::x86_fma_vfmaddsub_ps_256: 10078 case Intrinsic::x86_fma_vfmaddsub_pd_256: 10079 Opc = X86ISD::FMADDSUB; 10080 break; 10081 case Intrinsic::x86_fma_vfmsubadd_ps: 10082 case Intrinsic::x86_fma_vfmsubadd_pd: 10083 case Intrinsic::x86_fma_vfmsubadd_ps_256: 10084 case Intrinsic::x86_fma_vfmsubadd_pd_256: 10085 Opc = X86ISD::FMSUBADD; 10086 break; 10087 } 10088 10089 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), 10090 Op.getOperand(2), Op.getOperand(3)); 10091 } 10092 } 10093} 10094 10095SDValue 10096X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { 10097 DebugLoc dl = Op.getDebugLoc(); 10098 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10099 switch (IntNo) { 10100 default: return SDValue(); // Don't custom lower most intrinsics. 10101 10102 // RDRAND intrinsics. 10103 case Intrinsic::x86_rdrand_16: 10104 case Intrinsic::x86_rdrand_32: 10105 case Intrinsic::x86_rdrand_64: { 10106 // Emit the node with the right value type. 10107 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); 10108 SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0)); 10109 10110 // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise 10111 // return the value from Rand, which is always 0, casted to i32. 10112 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), 10113 DAG.getConstant(1, Op->getValueType(1)), 10114 DAG.getConstant(X86::COND_B, MVT::i32), 10115 SDValue(Result.getNode(), 1) }; 10116 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, 10117 DAG.getVTList(Op->getValueType(1), MVT::Glue), 10118 Ops, 4); 10119 10120 // Return { result, isValid, chain }. 10121 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, 10122 SDValue(Result.getNode(), 2)); 10123 } 10124 } 10125} 10126 10127SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 10128 SelectionDAG &DAG) const { 10129 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 10130 MFI->setReturnAddressIsTaken(true); 10131 10132 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 10133 DebugLoc dl = Op.getDebugLoc(); 10134 10135 if (Depth > 0) { 10136 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 10137 SDValue Offset = 10138 DAG.getConstant(TD->getPointerSize(), 10139 Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32); 10140 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 10141 DAG.getNode(ISD::ADD, dl, getPointerTy(), 10142 FrameAddr, Offset), 10143 MachinePointerInfo(), false, false, false, 0); 10144 } 10145 10146 // Just load the return address. 10147 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 10148 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 10149 RetAddrFI, MachinePointerInfo(), false, false, false, 0); 10150} 10151 10152SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 10153 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 10154 MFI->setFrameAddressIsTaken(true); 10155 10156 EVT VT = Op.getValueType(); 10157 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 10158 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 10159 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 10160 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 10161 while (Depth--) 10162 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 10163 MachinePointerInfo(), 10164 false, false, false, 0); 10165 return FrameAddr; 10166} 10167 10168SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 10169 SelectionDAG &DAG) const { 10170 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 10171} 10172 10173SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 10174 SDValue Chain = Op.getOperand(0); 10175 SDValue Offset = Op.getOperand(1); 10176 SDValue Handler = Op.getOperand(2); 10177 DebugLoc dl = Op.getDebugLoc(); 10178 10179 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 10180 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 10181 getPointerTy()); 10182 unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); 10183 10184 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 10185 DAG.getIntPtrConstant(TD->getPointerSize())); 10186 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 10187 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 10188 false, false, 0); 10189 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 10190 10191 return DAG.getNode(X86ISD::EH_RETURN, dl, 10192 MVT::Other, 10193 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 10194} 10195 10196SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, 10197 SelectionDAG &DAG) const { 10198 return Op.getOperand(0); 10199} 10200 10201SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 10202 SelectionDAG &DAG) const { 10203 SDValue Root = Op.getOperand(0); 10204 SDValue Trmp = Op.getOperand(1); // trampoline 10205 SDValue FPtr = Op.getOperand(2); // nested function 10206 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 10207 DebugLoc dl = Op.getDebugLoc(); 10208 10209 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 10210 10211 if (Subtarget->is64Bit()) { 10212 SDValue OutChains[6]; 10213 10214 // Large code-model. 10215 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 10216 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 10217 10218 const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); 10219 const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); 10220 10221 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 10222 10223 // Load the pointer to the nested function into R11. 
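// (Taken together, the six stores that follow lay down, roughly:
//   movabsq $<FPtr>, %r11        ; bytes 0-9
//   movabsq $<Nest>, %r10        ; bytes 10-19
//   jmpq    *%r11                ; bytes 20-22 )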
10224 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 10225 SDValue Addr = Trmp; 10226 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 10227 Addr, MachinePointerInfo(TrmpAddr), 10228 false, false, 0); 10229 10230 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10231 DAG.getConstant(2, MVT::i64)); 10232 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 10233 MachinePointerInfo(TrmpAddr, 2), 10234 false, false, 2); 10235 10236 // Load the 'nest' parameter value into R10. 10237 // R10 is specified in X86CallingConv.td 10238 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 10239 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10240 DAG.getConstant(10, MVT::i64)); 10241 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 10242 Addr, MachinePointerInfo(TrmpAddr, 10), 10243 false, false, 0); 10244 10245 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10246 DAG.getConstant(12, MVT::i64)); 10247 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 10248 MachinePointerInfo(TrmpAddr, 12), 10249 false, false, 2); 10250 10251 // Jump to the nested function. 10252 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 10253 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10254 DAG.getConstant(20, MVT::i64)); 10255 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 10256 Addr, MachinePointerInfo(TrmpAddr, 20), 10257 false, false, 0); 10258 10259 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 10260 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 10261 DAG.getConstant(22, MVT::i64)); 10262 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 10263 MachinePointerInfo(TrmpAddr, 22), 10264 false, false, 0); 10265 10266 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); 10267 } else { 10268 const Function *Func = 10269 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 10270 CallingConv::ID CC = Func->getCallingConv(); 10271 unsigned NestReg; 10272 10273 switch (CC) { 10274 default: 10275 llvm_unreachable("Unsupported calling convention"); 10276 case CallingConv::C: 10277 case CallingConv::X86_StdCall: { 10278 // Pass 'nest' parameter in ECX. 10279 // Must be kept in sync with X86CallingConv.td 10280 NestReg = X86::ECX; 10281 10282 // Check that ECX wasn't needed by an 'inreg' parameter. 10283 FunctionType *FTy = Func->getFunctionType(); 10284 const AttrListPtr &Attrs = Func->getAttributes(); 10285 10286 if (!Attrs.isEmpty() && !Func->isVarArg()) { 10287 unsigned InRegCount = 0; 10288 unsigned Idx = 1; 10289 10290 for (FunctionType::param_iterator I = FTy->param_begin(), 10291 E = FTy->param_end(); I != E; ++I, ++Idx) 10292 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 10293 // FIXME: should only count parameters that are lowered to integers. 10294 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 10295 10296 if (InRegCount > 2) { 10297 report_fatal_error("Nest register in use - reduce number of inreg" 10298 " parameters!"); 10299 } 10300 } 10301 break; 10302 } 10303 case CallingConv::X86_FastCall: 10304 case CallingConv::X86_ThisCall: 10305 case CallingConv::Fast: 10306 // Pass 'nest' parameter in EAX. 
10307 // Must be kept in sync with X86CallingConv.td 10308 NestReg = X86::EAX; 10309 break; 10310 } 10311 10312 SDValue OutChains[4]; 10313 SDValue Addr, Disp; 10314 10315 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 10316 DAG.getConstant(10, MVT::i32)); 10317 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 10318 10319 // This is storing the opcode for MOV32ri. 10320 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 10321 const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); 10322 OutChains[0] = DAG.getStore(Root, dl, 10323 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 10324 Trmp, MachinePointerInfo(TrmpAddr), 10325 false, false, 0); 10326 10327 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 10328 DAG.getConstant(1, MVT::i32)); 10329 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 10330 MachinePointerInfo(TrmpAddr, 1), 10331 false, false, 1); 10332 10333 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 10334 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 10335 DAG.getConstant(5, MVT::i32)); 10336 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 10337 MachinePointerInfo(TrmpAddr, 5), 10338 false, false, 1); 10339 10340 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 10341 DAG.getConstant(6, MVT::i32)); 10342 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 10343 MachinePointerInfo(TrmpAddr, 6), 10344 false, false, 1); 10345 10346 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); 10347 } 10348} 10349 10350SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 10351 SelectionDAG &DAG) const { 10352 /* 10353 The rounding mode is in bits 11:10 of FPSR, and has the following 10354 settings: 10355 00 Round to nearest 10356 01 Round to -inf 10357 10 Round to +inf 10358 11 Round to 0 10359 10360 FLT_ROUNDS, on the other hand, expects the following: 10361 -1 Undefined 10362 0 Round to 0 10363 1 Round to nearest 10364 2 Round to +inf 10365 3 Round to -inf 10366 10367 To perform the conversion, we do: 10368 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 10369 */ 10370 10371 MachineFunction &MF = DAG.getMachineFunction(); 10372 const TargetMachine &TM = MF.getTarget(); 10373 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 10374 unsigned StackAlignment = TFI.getStackAlignment(); 10375 EVT VT = Op.getValueType(); 10376 DebugLoc DL = Op.getDebugLoc(); 10377 10378 // Save FP Control Word to stack slot 10379 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 10380 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 10381 10382 10383 MachineMemOperand *MMO = 10384 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 10385 MachineMemOperand::MOStore, 2, 2); 10386 10387 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 10388 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 10389 DAG.getVTList(MVT::Other), 10390 Ops, 2, MVT::i16, MMO); 10391 10392 // Load FP Control Word from stack slot 10393 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 10394 MachinePointerInfo(), false, false, false, 0); 10395 10396 // Transform as necessary 10397 SDValue CWD1 = 10398 DAG.getNode(ISD::SRL, DL, MVT::i16, 10399 DAG.getNode(ISD::AND, DL, MVT::i16, 10400 CWD, DAG.getConstant(0x800, MVT::i16)), 10401 DAG.getConstant(11, MVT::i8)); 10402 SDValue CWD2 = 10403 DAG.getNode(ISD::SRL, DL, MVT::i16, 10404 DAG.getNode(ISD::AND, DL, MVT::i16, 10405 CWD, DAG.getConstant(0x400, MVT::i16)), 10406 DAG.getConstant(9, MVT::i8)); 10407 10408 SDValue RetVal = 
10409 DAG.getNode(ISD::AND, DL, MVT::i16, 10410 DAG.getNode(ISD::ADD, DL, MVT::i16, 10411 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 10412 DAG.getConstant(1, MVT::i16)), 10413 DAG.getConstant(3, MVT::i16)); 10414 10415 10416 return DAG.getNode((VT.getSizeInBits() < 16 ? 10417 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 10418} 10419 10420SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 10421 EVT VT = Op.getValueType(); 10422 EVT OpVT = VT; 10423 unsigned NumBits = VT.getSizeInBits(); 10424 DebugLoc dl = Op.getDebugLoc(); 10425 10426 Op = Op.getOperand(0); 10427 if (VT == MVT::i8) { 10428 // Zero extend to i32 since there is not an i8 bsr. 10429 OpVT = MVT::i32; 10430 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 10431 } 10432 10433 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 10434 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 10435 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 10436 10437 // If src is zero (i.e. bsr sets ZF), returns NumBits. 10438 SDValue Ops[] = { 10439 Op, 10440 DAG.getConstant(NumBits+NumBits-1, OpVT), 10441 DAG.getConstant(X86::COND_E, MVT::i8), 10442 Op.getValue(1) 10443 }; 10444 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 10445 10446 // Finally xor with NumBits-1. 10447 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 10448 10449 if (VT == MVT::i8) 10450 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 10451 return Op; 10452} 10453 10454SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, 10455 SelectionDAG &DAG) const { 10456 EVT VT = Op.getValueType(); 10457 EVT OpVT = VT; 10458 unsigned NumBits = VT.getSizeInBits(); 10459 DebugLoc dl = Op.getDebugLoc(); 10460 10461 Op = Op.getOperand(0); 10462 if (VT == MVT::i8) { 10463 // Zero extend to i32 since there is not an i8 bsr. 10464 OpVT = MVT::i32; 10465 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 10466 } 10467 10468 // Issue a bsr (scan bits in reverse). 10469 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 10470 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 10471 10472 // And xor with NumBits-1. 10473 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 10474 10475 if (VT == MVT::i8) 10476 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 10477 return Op; 10478} 10479 10480SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 10481 EVT VT = Op.getValueType(); 10482 unsigned NumBits = VT.getSizeInBits(); 10483 DebugLoc dl = Op.getDebugLoc(); 10484 Op = Op.getOperand(0); 10485 10486 // Issue a bsf (scan bits forward) which also sets EFLAGS. 10487 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 10488 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 10489 10490 // If src is zero (i.e. bsf sets ZF), returns NumBits. 10491 SDValue Ops[] = { 10492 Op, 10493 DAG.getConstant(NumBits, VT), 10494 DAG.getConstant(X86::COND_E, MVT::i8), 10495 Op.getValue(1) 10496 }; 10497 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); 10498} 10499 10500// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit 10501// ones, and then concatenate the result back. 
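// e.g. (illustrative) an add on v8i32 becomes
//   (concat_vectors (add v4i32 LHS_lo, RHS_lo),
//                   (add v4i32 LHS_hi, RHS_hi))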
10502static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 10503 EVT VT = Op.getValueType(); 10504 10505 assert(VT.is256BitVector() && VT.isInteger() && 10506 "Unsupported value type for operation"); 10507 10508 unsigned NumElems = VT.getVectorNumElements(); 10509 DebugLoc dl = Op.getDebugLoc(); 10510 10511 // Extract the LHS vectors 10512 SDValue LHS = Op.getOperand(0); 10513 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 10514 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 10515 10516 // Extract the RHS vectors 10517 SDValue RHS = Op.getOperand(1); 10518 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 10519 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 10520 10521 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 10522 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 10523 10524 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 10525 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 10526 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 10527} 10528 10529SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const { 10530 assert(Op.getValueType().is256BitVector() && 10531 Op.getValueType().isInteger() && 10532 "Only handle AVX 256-bit vector integer operation"); 10533 return Lower256IntArith(Op, DAG); 10534} 10535 10536SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const { 10537 assert(Op.getValueType().is256BitVector() && 10538 Op.getValueType().isInteger() && 10539 "Only handle AVX 256-bit vector integer operation"); 10540 return Lower256IntArith(Op, DAG); 10541} 10542 10543SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { 10544 EVT VT = Op.getValueType(); 10545 10546 // Decompose 256-bit ops into smaller 128-bit ops. 10547 if (VT.is256BitVector() && !Subtarget->hasAVX2()) 10548 return Lower256IntArith(Op, DAG); 10549 10550 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && 10551 "Only know how to lower V2I64/V4I64 multiply"); 10552 10553 DebugLoc dl = Op.getDebugLoc(); 10554 10555 // Ahi = psrlqi(a, 32); 10556 // Bhi = psrlqi(b, 32); 10557 // 10558 // AloBlo = pmuludq(a, b); 10559 // AloBhi = pmuludq(a, Bhi); 10560 // AhiBlo = pmuludq(Ahi, b); 10561 10562 // AloBhi = psllqi(AloBhi, 32); 10563 // AhiBlo = psllqi(AhiBlo, 32); 10564 // return AloBlo + AloBhi + AhiBlo; 10565 10566 SDValue A = Op.getOperand(0); 10567 SDValue B = Op.getOperand(1); 10568 10569 SDValue ShAmt = DAG.getConstant(32, MVT::i32); 10570 10571 SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); 10572 SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); 10573 10574 // Bit cast to 32-bit vectors for MULUDQ 10575 EVT MulVT = (VT == MVT::v2i64) ? 
MVT::v4i32 : MVT::v8i32; 10576 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); 10577 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); 10578 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); 10579 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); 10580 10581 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); 10582 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); 10583 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); 10584 10585 AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); 10586 AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); 10587 10588 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 10589 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 10590} 10591 10592SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 10593 10594 EVT VT = Op.getValueType(); 10595 DebugLoc dl = Op.getDebugLoc(); 10596 SDValue R = Op.getOperand(0); 10597 SDValue Amt = Op.getOperand(1); 10598 LLVMContext *Context = DAG.getContext(); 10599 10600 if (!Subtarget->hasSSE2()) 10601 return SDValue(); 10602 10603 // Optimize shl/srl/sra with constant shift amount. 10604 if (isSplatVector(Amt.getNode())) { 10605 SDValue SclrAmt = Amt->getOperand(0); 10606 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 10607 uint64_t ShiftAmt = C->getZExtValue(); 10608 10609 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 10610 (Subtarget->hasAVX2() && 10611 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { 10612 if (Op.getOpcode() == ISD::SHL) 10613 return DAG.getNode(X86ISD::VSHLI, dl, VT, R, 10614 DAG.getConstant(ShiftAmt, MVT::i32)); 10615 if (Op.getOpcode() == ISD::SRL) 10616 return DAG.getNode(X86ISD::VSRLI, dl, VT, R, 10617 DAG.getConstant(ShiftAmt, MVT::i32)); 10618 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) 10619 return DAG.getNode(X86ISD::VSRAI, dl, VT, R, 10620 DAG.getConstant(ShiftAmt, MVT::i32)); 10621 } 10622 10623 if (VT == MVT::v16i8) { 10624 if (Op.getOpcode() == ISD::SHL) { 10625 // Make a large shift. 10626 SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R, 10627 DAG.getConstant(ShiftAmt, MVT::i32)); 10628 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 10629 // Zero out the rightmost bits. 10630 SmallVector<SDValue, 16> V(16, 10631 DAG.getConstant(uint8_t(-1U << ShiftAmt), 10632 MVT::i8)); 10633 return DAG.getNode(ISD::AND, dl, VT, SHL, 10634 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 10635 } 10636 if (Op.getOpcode() == ISD::SRL) { 10637 // Make a large shift. 10638 SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R, 10639 DAG.getConstant(ShiftAmt, MVT::i32)); 10640 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 10641 // Zero out the leftmost bits. 
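          // There is no byte-granularity vector shift, so the shift above was
          // done on v8i16 lanes; bits of the neighbouring byte can leak into
          // the top of each byte. Masking every lane with (0xFF >> ShiftAmt)
          // clears them, e.g. for ShiftAmt == 3 the per-byte mask is 0x1F.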
10642 SmallVector<SDValue, 16> V(16, 10643 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 10644 MVT::i8)); 10645 return DAG.getNode(ISD::AND, dl, VT, SRL, 10646 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 10647 } 10648 if (Op.getOpcode() == ISD::SRA) { 10649 if (ShiftAmt == 7) { 10650 // R s>> 7 === R s< 0 10651 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 10652 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 10653 } 10654 10655 // R s>> a === ((R u>> a) ^ m) - m 10656 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 10657 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, 10658 MVT::i8)); 10659 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); 10660 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 10661 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 10662 return Res; 10663 } 10664 llvm_unreachable("Unknown shift opcode."); 10665 } 10666 10667 if (Subtarget->hasAVX2() && VT == MVT::v32i8) { 10668 if (Op.getOpcode() == ISD::SHL) { 10669 // Make a large shift. 10670 SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, 10671 DAG.getConstant(ShiftAmt, MVT::i32)); 10672 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 10673 // Zero out the rightmost bits. 10674 SmallVector<SDValue, 32> V(32, 10675 DAG.getConstant(uint8_t(-1U << ShiftAmt), 10676 MVT::i8)); 10677 return DAG.getNode(ISD::AND, dl, VT, SHL, 10678 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 10679 } 10680 if (Op.getOpcode() == ISD::SRL) { 10681 // Make a large shift. 10682 SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R, 10683 DAG.getConstant(ShiftAmt, MVT::i32)); 10684 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 10685 // Zero out the leftmost bits. 10686 SmallVector<SDValue, 32> V(32, 10687 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 10688 MVT::i8)); 10689 return DAG.getNode(ISD::AND, dl, VT, SRL, 10690 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 10691 } 10692 if (Op.getOpcode() == ISD::SRA) { 10693 if (ShiftAmt == 7) { 10694 // R s>> 7 === R s< 0 10695 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 10696 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 10697 } 10698 10699 // R s>> a === ((R u>> a) ^ m) - m 10700 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 10701 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, 10702 MVT::i8)); 10703 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); 10704 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 10705 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 10706 return Res; 10707 } 10708 llvm_unreachable("Unknown shift opcode."); 10709 } 10710 } 10711 } 10712 10713 // Lower SHL with variable shift amount. 
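  // For v4i32 a variable shift left is rewritten as a multiply: x << y equals
  // x * 2^y, and 2^y is built per lane without shifting by placing y in the
  // exponent field of an IEEE float ((y << 23) + 0x3f800000 reinterpreted as
  // f32 is 2.0^y), then converting the float back to an integer.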
10714 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 10715 Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1), 10716 DAG.getConstant(23, MVT::i32)); 10717 10718 const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U}; 10719 Constant *C = ConstantDataVector::get(*Context, CV); 10720 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 10721 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 10722 MachinePointerInfo::getConstantPool(), 10723 false, false, false, 16); 10724 10725 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 10726 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 10727 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 10728 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 10729 } 10730 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 10731 assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); 10732 10733 // a = a << 5; 10734 Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1), 10735 DAG.getConstant(5, MVT::i32)); 10736 Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); 10737 10738 // Turn 'a' into a mask suitable for VSELECT 10739 SDValue VSelM = DAG.getConstant(0x80, VT); 10740 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10741 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 10742 10743 SDValue CM1 = DAG.getConstant(0x0f, VT); 10744 SDValue CM2 = DAG.getConstant(0x3f, VT); 10745 10746 // r = VSELECT(r, psllw(r & (char16)15, 4), a); 10747 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); 10748 M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 10749 DAG.getConstant(4, MVT::i32), DAG); 10750 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 10751 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 10752 10753 // a += a 10754 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 10755 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10756 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 10757 10758 // r = VSELECT(r, psllw(r & (char16)63, 2), a); 10759 M = DAG.getNode(ISD::AND, dl, VT, R, CM2); 10760 M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 10761 DAG.getConstant(2, MVT::i32), DAG); 10762 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 10763 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 10764 10765 // a += a 10766 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 10767 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10768 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 10769 10770 // return VSELECT(r, r+r, a); 10771 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, 10772 DAG.getNode(ISD::ADD, dl, VT, R, R), R); 10773 return R; 10774 } 10775 10776 // Decompose 256-bit shifts into smaller 128-bit shifts. 
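  // Both the shifted value and the shift-amount vector are split into 128-bit
  // halves; a constant BUILD_VECTOR amount is split element by element so
  // each half keeps its own per-lane counts.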
10777 if (VT.is256BitVector()) { 10778 unsigned NumElems = VT.getVectorNumElements(); 10779 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 10780 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 10781 10782 // Extract the two vectors 10783 SDValue V1 = Extract128BitVector(R, 0, DAG, dl); 10784 SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); 10785 10786 // Recreate the shift amount vectors 10787 SDValue Amt1, Amt2; 10788 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 10789 // Constant shift amount 10790 SmallVector<SDValue, 4> Amt1Csts; 10791 SmallVector<SDValue, 4> Amt2Csts; 10792 for (unsigned i = 0; i != NumElems/2; ++i) 10793 Amt1Csts.push_back(Amt->getOperand(i)); 10794 for (unsigned i = NumElems/2; i != NumElems; ++i) 10795 Amt2Csts.push_back(Amt->getOperand(i)); 10796 10797 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 10798 &Amt1Csts[0], NumElems/2); 10799 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 10800 &Amt2Csts[0], NumElems/2); 10801 } else { 10802 // Variable shift amount 10803 Amt1 = Extract128BitVector(Amt, 0, DAG, dl); 10804 Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); 10805 } 10806 10807 // Issue new vector shifts for the smaller types 10808 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); 10809 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); 10810 10811 // Concatenate the result back 10812 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); 10813 } 10814 10815 return SDValue(); 10816} 10817 10818SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 10819 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 10820 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 10821 // looks for this combo and may remove the "setcc" instruction if the "setcc" 10822 // has only one use. 10823 SDNode *N = Op.getNode(); 10824 SDValue LHS = N->getOperand(0); 10825 SDValue RHS = N->getOperand(1); 10826 unsigned BaseOp = 0; 10827 unsigned Cond = 0; 10828 DebugLoc DL = Op.getDebugLoc(); 10829 switch (Op.getOpcode()) { 10830 default: llvm_unreachable("Unknown ovf instruction!"); 10831 case ISD::SADDO: 10832 // A subtract of one will be selected as a INC. Note that INC doesn't 10833 // set CF, so we can't do this for UADDO. 10834 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 10835 if (C->isOne()) { 10836 BaseOp = X86ISD::INC; 10837 Cond = X86::COND_O; 10838 break; 10839 } 10840 BaseOp = X86ISD::ADD; 10841 Cond = X86::COND_O; 10842 break; 10843 case ISD::UADDO: 10844 BaseOp = X86ISD::ADD; 10845 Cond = X86::COND_B; 10846 break; 10847 case ISD::SSUBO: 10848 // A subtract of one will be selected as a DEC. Note that DEC doesn't 10849 // set CF, so we can't do this for USUBO. 
10850 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 10851 if (C->isOne()) { 10852 BaseOp = X86ISD::DEC; 10853 Cond = X86::COND_O; 10854 break; 10855 } 10856 BaseOp = X86ISD::SUB; 10857 Cond = X86::COND_O; 10858 break; 10859 case ISD::USUBO: 10860 BaseOp = X86ISD::SUB; 10861 Cond = X86::COND_B; 10862 break; 10863 case ISD::SMULO: 10864 BaseOp = X86ISD::SMUL; 10865 Cond = X86::COND_O; 10866 break; 10867 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 10868 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 10869 MVT::i32); 10870 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 10871 10872 SDValue SetCC = 10873 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10874 DAG.getConstant(X86::COND_O, MVT::i32), 10875 SDValue(Sum.getNode(), 2)); 10876 10877 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 10878 } 10879 } 10880 10881 // Also sets EFLAGS. 10882 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 10883 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 10884 10885 SDValue SetCC = 10886 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 10887 DAG.getConstant(Cond, MVT::i32), 10888 SDValue(Sum.getNode(), 1)); 10889 10890 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 10891} 10892 10893SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 10894 SelectionDAG &DAG) const { 10895 DebugLoc dl = Op.getDebugLoc(); 10896 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 10897 EVT VT = Op.getValueType(); 10898 10899 if (!Subtarget->hasSSE2() || !VT.isVector()) 10900 return SDValue(); 10901 10902 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 10903 ExtraVT.getScalarType().getSizeInBits(); 10904 SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); 10905 10906 switch (VT.getSimpleVT().SimpleTy) { 10907 default: return SDValue(); 10908 case MVT::v8i32: 10909 case MVT::v16i16: 10910 if (!Subtarget->hasAVX()) 10911 return SDValue(); 10912 if (!Subtarget->hasAVX2()) { 10913 // needs to be split 10914 unsigned NumElems = VT.getVectorNumElements(); 10915 10916 // Extract the LHS vectors 10917 SDValue LHS = Op.getOperand(0); 10918 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 10919 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 10920 10921 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 10922 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 10923 10924 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 10925 unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); 10926 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, 10927 ExtraNumElems/2); 10928 SDValue Extra = DAG.getValueType(ExtraVT); 10929 10930 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); 10931 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); 10932 10933 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);; 10934 } 10935 // fall through 10936 case MVT::v4i32: 10937 case MVT::v8i16: { 10938 SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, 10939 Op.getOperand(0), ShAmt, DAG); 10940 return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG); 10941 } 10942 } 10943} 10944 10945 10946SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 10947 DebugLoc dl = Op.getDebugLoc(); 10948 10949 // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. 10950 // There isn't any reason to disable it if the target processor supports it. 
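  // Without SSE2 in 32-bit mode there is no mfence, so fall back to a
  // LOCK-prefixed OR of zero into the word at the top of the stack; any
  // locked read-modify-write acts as a full memory barrier on x86.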
10951 if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { 10952 SDValue Chain = Op.getOperand(0); 10953 SDValue Zero = DAG.getConstant(0, MVT::i32); 10954 SDValue Ops[] = { 10955 DAG.getRegister(X86::ESP, MVT::i32), // Base 10956 DAG.getTargetConstant(1, MVT::i8), // Scale 10957 DAG.getRegister(0, MVT::i32), // Index 10958 DAG.getTargetConstant(0, MVT::i32), // Disp 10959 DAG.getRegister(0, MVT::i32), // Segment. 10960 Zero, 10961 Chain 10962 }; 10963 SDNode *Res = 10964 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 10965 array_lengthof(Ops)); 10966 return SDValue(Res, 0); 10967 } 10968 10969 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 10970 if (!isDev) 10971 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 10972 10973 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10974 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 10975 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 10976 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 10977 10978 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 10979 if (!Op1 && !Op2 && !Op3 && Op4) 10980 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 10981 10982 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 10983 if (Op1 && !Op2 && !Op3 && !Op4) 10984 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 10985 10986 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 10987 // (MFENCE)>; 10988 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 10989} 10990 10991SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, 10992 SelectionDAG &DAG) const { 10993 DebugLoc dl = Op.getDebugLoc(); 10994 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 10995 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 10996 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 10997 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 10998 10999 // The only fence that needs an instruction is a sequentially-consistent 11000 // cross-thread fence. 11001 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 11002 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 11003 // no-sse2). There isn't any reason to disable it if the target processor 11004 // supports it. 11005 if (Subtarget->hasSSE2() || Subtarget->is64Bit()) 11006 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 11007 11008 SDValue Chain = Op.getOperand(0); 11009 SDValue Zero = DAG.getConstant(0, MVT::i32); 11010 SDValue Ops[] = { 11011 DAG.getRegister(X86::ESP, MVT::i32), // Base 11012 DAG.getTargetConstant(1, MVT::i8), // Scale 11013 DAG.getRegister(0, MVT::i32), // Index 11014 DAG.getTargetConstant(0, MVT::i32), // Disp 11015 DAG.getRegister(0, MVT::i32), // Segment. 11016 Zero, 11017 Chain 11018 }; 11019 SDNode *Res = 11020 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 11021 array_lengthof(Ops)); 11022 return SDValue(Res, 0); 11023 } 11024 11025 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
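  // Any fence weaker than a sequentially-consistent cross-thread fence needs
  // no instruction on x86: the TSO memory model already gives loads acquire
  // and stores release semantics, so the node only has to keep the compiler
  // from reordering memory operations across it.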
11026 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 11027} 11028 11029 11030SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 11031 EVT T = Op.getValueType(); 11032 DebugLoc DL = Op.getDebugLoc(); 11033 unsigned Reg = 0; 11034 unsigned size = 0; 11035 switch(T.getSimpleVT().SimpleTy) { 11036 default: llvm_unreachable("Invalid value type!"); 11037 case MVT::i8: Reg = X86::AL; size = 1; break; 11038 case MVT::i16: Reg = X86::AX; size = 2; break; 11039 case MVT::i32: Reg = X86::EAX; size = 4; break; 11040 case MVT::i64: 11041 assert(Subtarget->is64Bit() && "Node not type legal!"); 11042 Reg = X86::RAX; size = 8; 11043 break; 11044 } 11045 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 11046 Op.getOperand(2), SDValue()); 11047 SDValue Ops[] = { cpIn.getValue(0), 11048 Op.getOperand(1), 11049 Op.getOperand(3), 11050 DAG.getTargetConstant(size, MVT::i8), 11051 cpIn.getValue(1) }; 11052 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 11053 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 11054 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 11055 Ops, 5, T, MMO); 11056 SDValue cpOut = 11057 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 11058 return cpOut; 11059} 11060 11061SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 11062 SelectionDAG &DAG) const { 11063 assert(Subtarget->is64Bit() && "Result not type legalized?"); 11064 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 11065 SDValue TheChain = Op.getOperand(0); 11066 DebugLoc dl = Op.getDebugLoc(); 11067 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 11068 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 11069 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 11070 rax.getValue(2)); 11071 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 11072 DAG.getConstant(32, MVT::i8)); 11073 SDValue Ops[] = { 11074 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 11075 rdx.getValue(1) 11076 }; 11077 return DAG.getMergeValues(Ops, 2, dl); 11078} 11079 11080SDValue X86TargetLowering::LowerBITCAST(SDValue Op, 11081 SelectionDAG &DAG) const { 11082 EVT SrcVT = Op.getOperand(0).getValueType(); 11083 EVT DstVT = Op.getValueType(); 11084 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 11085 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 11086 assert((DstVT == MVT::i64 || 11087 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 11088 "Unexpected custom BITCAST"); 11089 // i64 <=> MMX conversions are Legal. 11090 if (SrcVT==MVT::i64 && DstVT.isVector()) 11091 return Op; 11092 if (DstVT==MVT::i64 && SrcVT.isVector()) 11093 return Op; 11094 // MMX <=> MMX conversions are Legal. 11095 if (SrcVT.isVector() && DstVT.isVector()) 11096 return Op; 11097 // All other conversions need to be expanded. 
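  // Returning an empty SDValue from the custom hook lets the legalizer fall
  // back to its default expansion, typically a store/load through a stack
  // temporary.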
11098 return SDValue(); 11099} 11100 11101SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 11102 SDNode *Node = Op.getNode(); 11103 DebugLoc dl = Node->getDebugLoc(); 11104 EVT T = Node->getValueType(0); 11105 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 11106 DAG.getConstant(0, T), Node->getOperand(2)); 11107 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 11108 cast<AtomicSDNode>(Node)->getMemoryVT(), 11109 Node->getOperand(0), 11110 Node->getOperand(1), negOp, 11111 cast<AtomicSDNode>(Node)->getSrcValue(), 11112 cast<AtomicSDNode>(Node)->getAlignment(), 11113 cast<AtomicSDNode>(Node)->getOrdering(), 11114 cast<AtomicSDNode>(Node)->getSynchScope()); 11115} 11116 11117static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 11118 SDNode *Node = Op.getNode(); 11119 DebugLoc dl = Node->getDebugLoc(); 11120 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 11121 11122 // Convert seq_cst store -> xchg 11123 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 11124 // FIXME: On 32-bit, store -> fist or movq would be more efficient 11125 // (The only way to get a 16-byte store is cmpxchg16b) 11126 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 11127 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 11128 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 11129 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 11130 cast<AtomicSDNode>(Node)->getMemoryVT(), 11131 Node->getOperand(0), 11132 Node->getOperand(1), Node->getOperand(2), 11133 cast<AtomicSDNode>(Node)->getMemOperand(), 11134 cast<AtomicSDNode>(Node)->getOrdering(), 11135 cast<AtomicSDNode>(Node)->getSynchScope()); 11136 return Swap.getValue(1); 11137 } 11138 // Other atomic stores have a simple pattern. 11139 return Op; 11140} 11141 11142static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 11143 EVT VT = Op.getNode()->getValueType(0); 11144 11145 // Let legalize expand this if it isn't a legal type yet. 11146 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 11147 return SDValue(); 11148 11149 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 11150 11151 unsigned Opc; 11152 bool ExtraOp = false; 11153 switch (Op.getOpcode()) { 11154 default: llvm_unreachable("Invalid code"); 11155 case ISD::ADDC: Opc = X86ISD::ADD; break; 11156 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 11157 case ISD::SUBC: Opc = X86ISD::SUB; break; 11158 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 11159 } 11160 11161 if (!ExtraOp) 11162 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 11163 Op.getOperand(1)); 11164 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 11165 Op.getOperand(1), Op.getOperand(2)); 11166} 11167 11168/// LowerOperation - Provide custom lowering hooks for some operations. 
11169/// 11170SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 11171 switch (Op.getOpcode()) { 11172 default: llvm_unreachable("Should not custom lower this!"); 11173 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 11174 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 11175 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG); 11176 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 11177 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 11178 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); 11179 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 11180 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 11181 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 11182 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 11183 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 11184 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 11185 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); 11186 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 11187 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 11188 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 11189 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 11190 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 11191 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 11192 case ISD::SHL_PARTS: 11193 case ISD::SRA_PARTS: 11194 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 11195 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 11196 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 11197 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 11198 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 11199 case ISD::FABS: return LowerFABS(Op, DAG); 11200 case ISD::FNEG: return LowerFNEG(Op, DAG); 11201 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 11202 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 11203 case ISD::SETCC: return LowerSETCC(Op, DAG); 11204 case ISD::SELECT: return LowerSELECT(Op, DAG); 11205 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 11206 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 11207 case ISD::VASTART: return LowerVASTART(Op, DAG); 11208 case ISD::VAARG: return LowerVAARG(Op, DAG); 11209 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 11210 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 11211 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); 11212 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 11213 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 11214 case ISD::FRAME_TO_ARGS_OFFSET: 11215 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 11216 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 11217 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 11218 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 11219 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 11220 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 11221 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 11222 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); 11223 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 11224 case ISD::MUL: return LowerMUL(Op, DAG); 11225 case ISD::SRA: 11226 case ISD::SRL: 11227 case ISD::SHL: return LowerShift(Op, DAG); 11228 case ISD::SADDO: 11229 case ISD::UADDO: 11230 case ISD::SSUBO: 11231 case ISD::USUBO: 11232 
case ISD::SMULO: 11233 case ISD::UMULO: return LowerXALUO(Op, DAG); 11234 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 11235 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 11236 case ISD::ADDC: 11237 case ISD::ADDE: 11238 case ISD::SUBC: 11239 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 11240 case ISD::ADD: return LowerADD(Op, DAG); 11241 case ISD::SUB: return LowerSUB(Op, DAG); 11242 } 11243} 11244 11245static void ReplaceATOMIC_LOAD(SDNode *Node, 11246 SmallVectorImpl<SDValue> &Results, 11247 SelectionDAG &DAG) { 11248 DebugLoc dl = Node->getDebugLoc(); 11249 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 11250 11251 // Convert wide load -> cmpxchg8b/cmpxchg16b 11252 // FIXME: On 32-bit, load -> fild or movq would be more efficient 11253 // (The only way to get a 16-byte load is cmpxchg16b) 11254 // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. 11255 SDValue Zero = DAG.getConstant(0, VT); 11256 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, 11257 Node->getOperand(0), 11258 Node->getOperand(1), Zero, Zero, 11259 cast<AtomicSDNode>(Node)->getMemOperand(), 11260 cast<AtomicSDNode>(Node)->getOrdering(), 11261 cast<AtomicSDNode>(Node)->getSynchScope()); 11262 Results.push_back(Swap.getValue(0)); 11263 Results.push_back(Swap.getValue(1)); 11264} 11265 11266static void 11267ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 11268 SelectionDAG &DAG, unsigned NewOp) { 11269 DebugLoc dl = Node->getDebugLoc(); 11270 assert (Node->getValueType(0) == MVT::i64 && 11271 "Only know how to expand i64 atomics"); 11272 11273 SDValue Chain = Node->getOperand(0); 11274 SDValue In1 = Node->getOperand(1); 11275 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 11276 Node->getOperand(2), DAG.getIntPtrConstant(0)); 11277 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 11278 Node->getOperand(2), DAG.getIntPtrConstant(1)); 11279 SDValue Ops[] = { Chain, In1, In2L, In2H }; 11280 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 11281 SDValue Result = 11282 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 11283 cast<MemSDNode>(Node)->getMemOperand()); 11284 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 11285 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 11286 Results.push_back(Result.getValue(2)); 11287} 11288 11289/// ReplaceNodeResults - Replace a node with an illegal result type 11290/// with a new node built out of custom code. 11291void X86TargetLowering::ReplaceNodeResults(SDNode *N, 11292 SmallVectorImpl<SDValue>&Results, 11293 SelectionDAG &DAG) const { 11294 DebugLoc dl = N->getDebugLoc(); 11295 switch (N->getOpcode()) { 11296 default: 11297 llvm_unreachable("Do not know how to custom type legalize this operation!"); 11298 case ISD::SIGN_EXTEND_INREG: 11299 case ISD::ADDC: 11300 case ISD::ADDE: 11301 case ISD::SUBC: 11302 case ISD::SUBE: 11303 // We don't want to expand or promote these. 11304 return; 11305 case ISD::FP_TO_SINT: 11306 case ISD::FP_TO_UINT: { 11307 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 11308 11309 if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) 11310 return; 11311 11312 std::pair<SDValue,SDValue> Vals = 11313 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); 11314 SDValue FIST = Vals.first, StackSlot = Vals.second; 11315 if (FIST.getNode() != 0) { 11316 EVT VT = N->getValueType(0); 11317 // Return a load from the stack slot. 
11318 if (StackSlot.getNode() != 0) 11319 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 11320 MachinePointerInfo(), 11321 false, false, false, 0)); 11322 else 11323 Results.push_back(FIST); 11324 } 11325 return; 11326 } 11327 case ISD::READCYCLECOUNTER: { 11328 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 11329 SDValue TheChain = N->getOperand(0); 11330 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 11331 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 11332 rd.getValue(1)); 11333 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 11334 eax.getValue(2)); 11335 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 11336 SDValue Ops[] = { eax, edx }; 11337 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 11338 Results.push_back(edx.getValue(1)); 11339 return; 11340 } 11341 case ISD::ATOMIC_CMP_SWAP: { 11342 EVT T = N->getValueType(0); 11343 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); 11344 bool Regs64bit = T == MVT::i128; 11345 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; 11346 SDValue cpInL, cpInH; 11347 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 11348 DAG.getConstant(0, HalfT)); 11349 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 11350 DAG.getConstant(1, HalfT)); 11351 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, 11352 Regs64bit ? X86::RAX : X86::EAX, 11353 cpInL, SDValue()); 11354 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, 11355 Regs64bit ? X86::RDX : X86::EDX, 11356 cpInH, cpInL.getValue(1)); 11357 SDValue swapInL, swapInH; 11358 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 11359 DAG.getConstant(0, HalfT)); 11360 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 11361 DAG.getConstant(1, HalfT)); 11362 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, 11363 Regs64bit ? X86::RBX : X86::EBX, 11364 swapInL, cpInH.getValue(1)); 11365 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, 11366 Regs64bit ? X86::RCX : X86::ECX, 11367 swapInH, swapInL.getValue(1)); 11368 SDValue Ops[] = { swapInH.getValue(0), 11369 N->getOperand(1), 11370 swapInH.getValue(1) }; 11371 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 11372 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 11373 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : 11374 X86ISD::LCMPXCHG8_DAG; 11375 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, 11376 Ops, 3, T, MMO); 11377 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, 11378 Regs64bit ? X86::RAX : X86::EAX, 11379 HalfT, Result.getValue(1)); 11380 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, 11381 Regs64bit ? 
X86::RDX : X86::EDX, 11382 HalfT, cpOutL.getValue(2)); 11383 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 11384 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); 11385 Results.push_back(cpOutH.getValue(1)); 11386 return; 11387 } 11388 case ISD::ATOMIC_LOAD_ADD: 11389 case ISD::ATOMIC_LOAD_AND: 11390 case ISD::ATOMIC_LOAD_NAND: 11391 case ISD::ATOMIC_LOAD_OR: 11392 case ISD::ATOMIC_LOAD_SUB: 11393 case ISD::ATOMIC_LOAD_XOR: 11394 case ISD::ATOMIC_SWAP: { 11395 unsigned Opc; 11396 switch (N->getOpcode()) { 11397 default: llvm_unreachable("Unexpected opcode"); 11398 case ISD::ATOMIC_LOAD_ADD: 11399 Opc = X86ISD::ATOMADD64_DAG; 11400 break; 11401 case ISD::ATOMIC_LOAD_AND: 11402 Opc = X86ISD::ATOMAND64_DAG; 11403 break; 11404 case ISD::ATOMIC_LOAD_NAND: 11405 Opc = X86ISD::ATOMNAND64_DAG; 11406 break; 11407 case ISD::ATOMIC_LOAD_OR: 11408 Opc = X86ISD::ATOMOR64_DAG; 11409 break; 11410 case ISD::ATOMIC_LOAD_SUB: 11411 Opc = X86ISD::ATOMSUB64_DAG; 11412 break; 11413 case ISD::ATOMIC_LOAD_XOR: 11414 Opc = X86ISD::ATOMXOR64_DAG; 11415 break; 11416 case ISD::ATOMIC_SWAP: 11417 Opc = X86ISD::ATOMSWAP64_DAG; 11418 break; 11419 } 11420 ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); 11421 return; 11422 } 11423 case ISD::ATOMIC_LOAD: 11424 ReplaceATOMIC_LOAD(N, Results, DAG); 11425 } 11426} 11427 11428const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 11429 switch (Opcode) { 11430 default: return NULL; 11431 case X86ISD::BSF: return "X86ISD::BSF"; 11432 case X86ISD::BSR: return "X86ISD::BSR"; 11433 case X86ISD::SHLD: return "X86ISD::SHLD"; 11434 case X86ISD::SHRD: return "X86ISD::SHRD"; 11435 case X86ISD::FAND: return "X86ISD::FAND"; 11436 case X86ISD::FOR: return "X86ISD::FOR"; 11437 case X86ISD::FXOR: return "X86ISD::FXOR"; 11438 case X86ISD::FSRL: return "X86ISD::FSRL"; 11439 case X86ISD::FILD: return "X86ISD::FILD"; 11440 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 11441 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 11442 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 11443 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 11444 case X86ISD::FLD: return "X86ISD::FLD"; 11445 case X86ISD::FST: return "X86ISD::FST"; 11446 case X86ISD::CALL: return "X86ISD::CALL"; 11447 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 11448 case X86ISD::BT: return "X86ISD::BT"; 11449 case X86ISD::CMP: return "X86ISD::CMP"; 11450 case X86ISD::COMI: return "X86ISD::COMI"; 11451 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 11452 case X86ISD::SETCC: return "X86ISD::SETCC"; 11453 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 11454 case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; 11455 case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; 11456 case X86ISD::CMOV: return "X86ISD::CMOV"; 11457 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 11458 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 11459 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 11460 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 11461 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 11462 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 11463 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 11464 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 11465 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 11466 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 11467 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 11468 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 11469 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 
11470 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 11471 case X86ISD::PSIGN: return "X86ISD::PSIGN"; 11472 case X86ISD::BLENDV: return "X86ISD::BLENDV"; 11473 case X86ISD::BLENDPW: return "X86ISD::BLENDPW"; 11474 case X86ISD::BLENDPS: return "X86ISD::BLENDPS"; 11475 case X86ISD::BLENDPD: return "X86ISD::BLENDPD"; 11476 case X86ISD::HADD: return "X86ISD::HADD"; 11477 case X86ISD::HSUB: return "X86ISD::HSUB"; 11478 case X86ISD::FHADD: return "X86ISD::FHADD"; 11479 case X86ISD::FHSUB: return "X86ISD::FHSUB"; 11480 case X86ISD::FMAX: return "X86ISD::FMAX"; 11481 case X86ISD::FMIN: return "X86ISD::FMIN"; 11482 case X86ISD::FMAXC: return "X86ISD::FMAXC"; 11483 case X86ISD::FMINC: return "X86ISD::FMINC"; 11484 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 11485 case X86ISD::FRCP: return "X86ISD::FRCP"; 11486 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 11487 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; 11488 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 11489 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 11490 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 11491 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 11492 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; 11493 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 11494 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 11495 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 11496 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 11497 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 11498 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 11499 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 11500 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 11501 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 11502 case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL"; 11503 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 11504 case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; 11505 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; 11506 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; 11507 case X86ISD::VSHL: return "X86ISD::VSHL"; 11508 case X86ISD::VSRL: return "X86ISD::VSRL"; 11509 case X86ISD::VSRA: return "X86ISD::VSRA"; 11510 case X86ISD::VSHLI: return "X86ISD::VSHLI"; 11511 case X86ISD::VSRLI: return "X86ISD::VSRLI"; 11512 case X86ISD::VSRAI: return "X86ISD::VSRAI"; 11513 case X86ISD::CMPP: return "X86ISD::CMPP"; 11514 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; 11515 case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; 11516 case X86ISD::ADD: return "X86ISD::ADD"; 11517 case X86ISD::SUB: return "X86ISD::SUB"; 11518 case X86ISD::ADC: return "X86ISD::ADC"; 11519 case X86ISD::SBB: return "X86ISD::SBB"; 11520 case X86ISD::SMUL: return "X86ISD::SMUL"; 11521 case X86ISD::UMUL: return "X86ISD::UMUL"; 11522 case X86ISD::INC: return "X86ISD::INC"; 11523 case X86ISD::DEC: return "X86ISD::DEC"; 11524 case X86ISD::OR: return "X86ISD::OR"; 11525 case X86ISD::XOR: return "X86ISD::XOR"; 11526 case X86ISD::AND: return "X86ISD::AND"; 11527 case X86ISD::ANDN: return "X86ISD::ANDN"; 11528 case X86ISD::BLSI: return "X86ISD::BLSI"; 11529 case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; 11530 case X86ISD::BLSR: return "X86ISD::BLSR"; 11531 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 11532 case X86ISD::PTEST: return "X86ISD::PTEST"; 11533 case X86ISD::TESTP: return "X86ISD::TESTP"; 11534 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 11535 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 11536 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 11537 case 
X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 11538 case X86ISD::SHUFP: return "X86ISD::SHUFP"; 11539 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 11540 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 11541 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 11542 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 11543 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 11544 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 11545 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 11546 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 11547 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 11548 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 11549 case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; 11550 case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; 11551 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 11552 case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; 11553 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; 11554 case X86ISD::VPERMV: return "X86ISD::VPERMV"; 11555 case X86ISD::VPERMI: return "X86ISD::VPERMI"; 11556 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; 11557 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 11558 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 11559 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 11560 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 11561 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; 11562 case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; 11563 case X86ISD::SAHF: return "X86ISD::SAHF"; 11564 case X86ISD::RDRAND: return "X86ISD::RDRAND"; 11565 case X86ISD::FMADD: return "X86ISD::FMADD"; 11566 case X86ISD::FMSUB: return "X86ISD::FMSUB"; 11567 case X86ISD::FNMADD: return "X86ISD::FNMADD"; 11568 case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; 11569 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; 11570 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; 11571 } 11572} 11573 11574// isLegalAddressingMode - Return true if the addressing mode represented 11575// by AM is legal for this target, for a load/store of the specified type. 11576bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 11577 Type *Ty) const { 11578 // X86 supports extremely general addressing modes. 11579 CodeModel::Model M = getTargetMachine().getCodeModel(); 11580 Reloc::Model R = getTargetMachine().getRelocationModel(); 11581 11582 // X86 allows a sign-extended 32-bit immediate field as a displacement. 11583 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 11584 return false; 11585 11586 if (AM.BaseGV) { 11587 unsigned GVFlags = 11588 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 11589 11590 // If a reference to this global requires an extra load, we can't fold it. 11591 if (isGlobalStubReference(GVFlags)) 11592 return false; 11593 11594 // If BaseGV requires a register for the PIC base, we cannot also have a 11595 // BaseReg specified. 11596 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 11597 return false; 11598 11599 // If lower 4G is not available, then we must use rip-relative addressing. 11600 if ((M != CodeModel::Small || R != Reloc::Static) && 11601 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 11602 return false; 11603 } 11604 11605 switch (AM.Scale) { 11606 case 0: 11607 case 1: 11608 case 2: 11609 case 4: 11610 case 8: 11611 // These scales always work. 11612 break; 11613 case 3: 11614 case 5: 11615 case 9: 11616 // These scales are formed with basereg+scalereg. Only accept if there is 11617 // no basereg yet. 
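    // A scale of 3, 5 or 9 is really index*2/4/8 plus the index register
    // itself (e.g. lea (%rax,%rax,2) yields rax*3), so the index register
    // must also occupy the base slot and no separate base can be present.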
11618 if (AM.HasBaseReg) 11619 return false; 11620 break; 11621 default: // Other stuff never works. 11622 return false; 11623 } 11624 11625 return true; 11626} 11627 11628 11629bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 11630 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 11631 return false; 11632 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 11633 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 11634 if (NumBits1 <= NumBits2) 11635 return false; 11636 return true; 11637} 11638 11639bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { 11640 return Imm == (int32_t)Imm; 11641} 11642 11643bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { 11644 // Can also use sub to handle negated immediates. 11645 return Imm == (int32_t)Imm; 11646} 11647 11648bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 11649 if (!VT1.isInteger() || !VT2.isInteger()) 11650 return false; 11651 unsigned NumBits1 = VT1.getSizeInBits(); 11652 unsigned NumBits2 = VT2.getSizeInBits(); 11653 if (NumBits1 <= NumBits2) 11654 return false; 11655 return true; 11656} 11657 11658bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 11659 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 11660 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 11661} 11662 11663bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 11664 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 11665 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 11666} 11667 11668bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 11669 // i16 instructions are longer (0x66 prefix) and potentially slower. 11670 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 11671} 11672 11673/// isShuffleMaskLegal - Targets can use this to indicate that they only 11674/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 11675/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 11676/// are assumed to be legal. 11677bool 11678X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 11679 EVT VT) const { 11680 // Very little shuffling can be done for 64-bit vectors right now. 11681 if (VT.getSizeInBits() == 64) 11682 return false; 11683 11684 // FIXME: pshufb, blends, shifts. 11685 return (VT.getVectorNumElements() == 2 || 11686 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 11687 isMOVLMask(M, VT) || 11688 isSHUFPMask(M, VT, Subtarget->hasAVX()) || 11689 isPSHUFDMask(M, VT) || 11690 isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) || 11691 isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) || 11692 isPALIGNRMask(M, VT, Subtarget) || 11693 isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || 11694 isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || 11695 isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) || 11696 isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2())); 11697} 11698 11699bool 11700X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 11701 EVT VT) const { 11702 unsigned NumElts = VT.getVectorNumElements(); 11703 // FIXME: This collection of masks seems suspect. 
11704 if (NumElts == 2) 11705 return true; 11706 if (NumElts == 4 && VT.is128BitVector()) { 11707 return (isMOVLMask(Mask, VT) || 11708 isCommutedMOVLMask(Mask, VT, true) || 11709 isSHUFPMask(Mask, VT, Subtarget->hasAVX()) || 11710 isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true)); 11711 } 11712 return false; 11713} 11714 11715//===----------------------------------------------------------------------===// 11716// X86 Scheduler Hooks 11717//===----------------------------------------------------------------------===// 11718 11719// private utility function 11720MachineBasicBlock * 11721X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 11722 MachineBasicBlock *MBB, 11723 unsigned regOpc, 11724 unsigned immOpc, 11725 unsigned LoadOpc, 11726 unsigned CXchgOpc, 11727 unsigned notOpc, 11728 unsigned EAXreg, 11729 const TargetRegisterClass *RC, 11730 bool Invert) const { 11731 // For the atomic bitwise operator, we generate 11732 // thisMBB: 11733 // newMBB: 11734 // ld t1 = [bitinstr.addr] 11735 // op t2 = t1, [bitinstr.val] 11736 // not t3 = t2 (if Invert) 11737 // mov EAX = t1 11738 // lcs dest = [bitinstr.addr], t3 [EAX is implicit] 11739 // bz newMBB 11740 // fallthrough -->nextMBB 11741 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11742 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11743 MachineFunction::iterator MBBIter = MBB; 11744 ++MBBIter; 11745 11746 /// First build the CFG 11747 MachineFunction *F = MBB->getParent(); 11748 MachineBasicBlock *thisMBB = MBB; 11749 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 11750 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 11751 F->insert(MBBIter, newMBB); 11752 F->insert(MBBIter, nextMBB); 11753 11754 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 
11755 nextMBB->splice(nextMBB->begin(), thisMBB, 11756 llvm::next(MachineBasicBlock::iterator(bInstr)), 11757 thisMBB->end()); 11758 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11759 11760 // Update thisMBB to fall through to newMBB 11761 thisMBB->addSuccessor(newMBB); 11762 11763 // newMBB jumps to itself and fall through to nextMBB 11764 newMBB->addSuccessor(nextMBB); 11765 newMBB->addSuccessor(newMBB); 11766 11767 // Insert instructions into newMBB based on incoming instruction 11768 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 11769 "unexpected number of operands"); 11770 DebugLoc dl = bInstr->getDebugLoc(); 11771 MachineOperand& destOper = bInstr->getOperand(0); 11772 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 11773 int numArgs = bInstr->getNumOperands() - 1; 11774 for (int i=0; i < numArgs; ++i) 11775 argOpers[i] = &bInstr->getOperand(i+1); 11776 11777 // x86 address has 4 operands: base, index, scale, and displacement 11778 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 11779 int valArgIndx = lastAddrIndx + 1; 11780 11781 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 11782 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 11783 for (int i=0; i <= lastAddrIndx; ++i) 11784 (*MIB).addOperand(*argOpers[i]); 11785 11786 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 11787 assert((argOpers[valArgIndx]->isReg() || 11788 argOpers[valArgIndx]->isImm()) && 11789 "invalid operand"); 11790 if (argOpers[valArgIndx]->isReg()) 11791 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 11792 else 11793 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 11794 MIB.addReg(t1); 11795 (*MIB).addOperand(*argOpers[valArgIndx]); 11796 11797 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 11798 if (Invert) { 11799 MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2); 11800 } 11801 else 11802 t3 = t2; 11803 11804 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 11805 MIB.addReg(t1); 11806 11807 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 11808 for (int i=0; i <= lastAddrIndx; ++i) 11809 (*MIB).addOperand(*argOpers[i]); 11810 MIB.addReg(t3); 11811 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 11812 (*MIB).setMemRefs(bInstr->memoperands_begin(), 11813 bInstr->memoperands_end()); 11814 11815 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 11816 MIB.addReg(EAXreg); 11817 11818 // insert branch 11819 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 11820 11821 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 11822 return nextMBB; 11823} 11824 11825// private utility function: 64 bit atomics on 32 bit host. 
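// On IA-32 the only 64-bit atomic primitive is cmpxchg8b, so a 64-bit atomic
// read-modify-write is expanded into a loop: load both 32-bit halves, apply
// the operation pairwise, then retry cmpxchg8b (desired value in ECX:EBX,
// expected old value in EDX:EAX) until no other writer has intervened.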
11826MachineBasicBlock * 11827X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 11828 MachineBasicBlock *MBB, 11829 unsigned regOpcL, 11830 unsigned regOpcH, 11831 unsigned immOpcL, 11832 unsigned immOpcH, 11833 bool Invert) const { 11834 // For the atomic bitwise operator, we generate 11835 // thisMBB (instructions are in pairs, except cmpxchg8b) 11836 // ld t1,t2 = [bitinstr.addr] 11837 // newMBB: 11838 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 11839 // op t5, t6 <- out1, out2, [bitinstr.val] 11840 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 11841 // neg t7, t8 < t5, t6 (if Invert) 11842 // mov ECX, EBX <- t5, t6 11843 // mov EAX, EDX <- t1, t2 11844 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 11845 // mov t3, t4 <- EAX, EDX 11846 // bz newMBB 11847 // result in out1, out2 11848 // fallthrough -->nextMBB 11849 11850 const TargetRegisterClass *RC = &X86::GR32RegClass; 11851 const unsigned LoadOpc = X86::MOV32rm; 11852 const unsigned NotOpc = X86::NOT32r; 11853 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11854 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11855 MachineFunction::iterator MBBIter = MBB; 11856 ++MBBIter; 11857 11858 /// First build the CFG 11859 MachineFunction *F = MBB->getParent(); 11860 MachineBasicBlock *thisMBB = MBB; 11861 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 11862 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 11863 F->insert(MBBIter, newMBB); 11864 F->insert(MBBIter, nextMBB); 11865 11866 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 11867 nextMBB->splice(nextMBB->begin(), thisMBB, 11868 llvm::next(MachineBasicBlock::iterator(bInstr)), 11869 thisMBB->end()); 11870 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11871 11872 // Update thisMBB to fall through to newMBB 11873 thisMBB->addSuccessor(newMBB); 11874 11875 // newMBB jumps to itself and fall through to nextMBB 11876 newMBB->addSuccessor(nextMBB); 11877 newMBB->addSuccessor(newMBB); 11878 11879 DebugLoc dl = bInstr->getDebugLoc(); 11880 // Insert instructions into newMBB based on incoming instruction 11881 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 11882 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 11883 "unexpected number of operands"); 11884 MachineOperand& dest1Oper = bInstr->getOperand(0); 11885 MachineOperand& dest2Oper = bInstr->getOperand(1); 11886 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 11887 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 11888 argOpers[i] = &bInstr->getOperand(i+2); 11889 11890 // We use some of the operands multiple times, so conservatively just 11891 // clear any kill flags that might be present. 11892 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 11893 argOpers[i]->setIsKill(false); 11894 } 11895 11896 // x86 address has 5 operands: base, index, scale, displacement, and segment. 11897 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 11898 11899 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 11900 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 11901 for (int i=0; i <= lastAddrIndx; ++i) 11902 (*MIB).addOperand(*argOpers[i]); 11903 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 11904 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 11905 // add 4 to displacement. 
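  // The second load fetches the high 32 bits of the 64-bit operand: base,
  // scale, index and segment are reused, and only the displacement (or the
  // operand offset for a non-immediate address) is bumped by 4.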
11906 for (int i=0; i <= lastAddrIndx-2; ++i) 11907 (*MIB).addOperand(*argOpers[i]); 11908 MachineOperand newOp3 = *(argOpers[3]); 11909 if (newOp3.isImm()) 11910 newOp3.setImm(newOp3.getImm()+4); 11911 else 11912 newOp3.setOffset(newOp3.getOffset()+4); 11913 (*MIB).addOperand(newOp3); 11914 (*MIB).addOperand(*argOpers[lastAddrIndx]); 11915 11916 // t3/4 are defined later, at the bottom of the loop 11917 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 11918 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 11919 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 11920 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 11921 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 11922 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 11923 11924 // The subsequent operations should be using the destination registers of 11925 // the PHI instructions. 11926 t1 = dest1Oper.getReg(); 11927 t2 = dest2Oper.getReg(); 11928 11929 int valArgIndx = lastAddrIndx + 1; 11930 assert((argOpers[valArgIndx]->isReg() || 11931 argOpers[valArgIndx]->isImm()) && 11932 "invalid operand"); 11933 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 11934 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 11935 if (argOpers[valArgIndx]->isReg()) 11936 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 11937 else 11938 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 11939 if (regOpcL != X86::MOV32rr) 11940 MIB.addReg(t1); 11941 (*MIB).addOperand(*argOpers[valArgIndx]); 11942 assert(argOpers[valArgIndx + 1]->isReg() == 11943 argOpers[valArgIndx]->isReg()); 11944 assert(argOpers[valArgIndx + 1]->isImm() == 11945 argOpers[valArgIndx]->isImm()); 11946 if (argOpers[valArgIndx + 1]->isReg()) 11947 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 11948 else 11949 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 11950 if (regOpcH != X86::MOV32rr) 11951 MIB.addReg(t2); 11952 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 11953 11954 unsigned t7, t8; 11955 if (Invert) { 11956 t7 = F->getRegInfo().createVirtualRegister(RC); 11957 t8 = F->getRegInfo().createVirtualRegister(RC); 11958 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5); 11959 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6); 11960 } else { 11961 t7 = t5; 11962 t8 = t6; 11963 } 11964 11965 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 11966 MIB.addReg(t1); 11967 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 11968 MIB.addReg(t2); 11969 11970 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 11971 MIB.addReg(t7); 11972 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 11973 MIB.addReg(t8); 11974 11975 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 11976 for (int i=0; i <= lastAddrIndx; ++i) 11977 (*MIB).addOperand(*argOpers[i]); 11978 11979 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 11980 (*MIB).setMemRefs(bInstr->memoperands_begin(), 11981 bInstr->memoperands_end()); 11982 11983 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 11984 MIB.addReg(X86::EAX); 11985 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 11986 MIB.addReg(X86::EDX); 11987 11988 // insert branch 11989 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 11990 11991 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 
  return nextMBB;
}

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
                                                      MachineBasicBlock *MBB,
                                                      unsigned cmovOpc) const {
  // For the atomic min/max operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld t1 = [min/max.addr]
  //     mov t2 = [min/max.val]
  //     cmp  t1, t2
  //     cmov[cond] t2 = t1
  //     mov EAX = t1
  //     lcs dest = [min/max.addr], t2  [EAX is implicit]
  //     bnz newMBB
  //     fallthrough -->nextMBB
  //
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(mInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = mInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  MachineOperand& destOper = mInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  int numArgs = mInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &mInstr->getOperand(i+1);

  // x86 address has 5 operands: base, scale, index, displacement, and segment
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  // We only support register and immediate values
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");

  unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
  MIB.addReg(t1);
  MIB.addReg(t2);

  // Generate cmov
  unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
  MIB.addReg(t2);
  MIB.addReg(t1);

  // Compare and exchange if no one has modified the memory location
  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
  for
(int i=0; i <= lastAddrIndx; ++i) 12085 (*MIB).addOperand(*argOpers[i]); 12086 MIB.addReg(t3); 12087 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 12088 (*MIB).setMemRefs(mInstr->memoperands_begin(), 12089 mInstr->memoperands_end()); 12090 12091 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 12092 MIB.addReg(X86::EAX); 12093 12094 // insert branch 12095 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 12096 12097 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 12098 return nextMBB; 12099} 12100 12101// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 12102// or XMM0_V32I8 in AVX all of this code can be replaced with that 12103// in the .td file. 12104MachineBasicBlock * 12105X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 12106 unsigned numArgs, bool memArg) const { 12107 assert(Subtarget->hasSSE42() && 12108 "Target must have SSE4.2 or AVX features enabled"); 12109 12110 DebugLoc dl = MI->getDebugLoc(); 12111 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12112 unsigned Opc; 12113 if (!Subtarget->hasAVX()) { 12114 if (memArg) 12115 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 12116 else 12117 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 12118 } else { 12119 if (memArg) 12120 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 12121 else 12122 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 12123 } 12124 12125 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 12126 for (unsigned i = 0; i < numArgs; ++i) { 12127 MachineOperand &Op = MI->getOperand(i+1); 12128 if (!(Op.isReg() && Op.isImplicit())) 12129 MIB.addOperand(Op); 12130 } 12131 BuildMI(*BB, MI, dl, 12132 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 12133 .addReg(X86::XMM0); 12134 12135 MI->eraseFromParent(); 12136 return BB; 12137} 12138 12139MachineBasicBlock * 12140X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { 12141 DebugLoc dl = MI->getDebugLoc(); 12142 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12143 12144 // Address into RAX/EAX, other two args into ECX, EDX. 12145 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 12146 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 12147 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 12148 for (int i = 0; i < X86::AddrNumOperands; ++i) 12149 MIB.addOperand(MI->getOperand(i)); 12150 12151 unsigned ValOps = X86::AddrNumOperands; 12152 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 12153 .addReg(MI->getOperand(ValOps).getReg()); 12154 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 12155 .addReg(MI->getOperand(ValOps+1).getReg()); 12156 12157 // The instruction doesn't actually take any operands though. 12158 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 12159 12160 MI->eraseFromParent(); // The pseudo is gone now. 12161 return BB; 12162} 12163 12164MachineBasicBlock * 12165X86TargetLowering::EmitVAARG64WithCustomInserter( 12166 MachineInstr *MI, 12167 MachineBasicBlock *MBB) const { 12168 // Emit va_arg instruction on X86-64. 
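  // The expansion follows the usual x86-64 va_arg scheme: for ArgMode 1/2 we
  // first check gp_offset/fp_offset against the end of the register save
  // area and load the argument from reg_save_area if it still fits there;
  // otherwise (and always for ArgMode 0) the argument is taken from the
  // overflow area, which is then advanced past it.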
12169 12170 // Operands to this pseudo-instruction: 12171 // 0 ) Output : destination address (reg) 12172 // 1-5) Input : va_list address (addr, i64mem) 12173 // 6 ) ArgSize : Size (in bytes) of vararg type 12174 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 12175 // 8 ) Align : Alignment of type 12176 // 9 ) EFLAGS (implicit-def) 12177 12178 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 12179 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 12180 12181 unsigned DestReg = MI->getOperand(0).getReg(); 12182 MachineOperand &Base = MI->getOperand(1); 12183 MachineOperand &Scale = MI->getOperand(2); 12184 MachineOperand &Index = MI->getOperand(3); 12185 MachineOperand &Disp = MI->getOperand(4); 12186 MachineOperand &Segment = MI->getOperand(5); 12187 unsigned ArgSize = MI->getOperand(6).getImm(); 12188 unsigned ArgMode = MI->getOperand(7).getImm(); 12189 unsigned Align = MI->getOperand(8).getImm(); 12190 12191 // Memory Reference 12192 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 12193 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 12194 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 12195 12196 // Machine Information 12197 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12198 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 12199 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 12200 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 12201 DebugLoc DL = MI->getDebugLoc(); 12202 12203 // struct va_list { 12204 // i32 gp_offset 12205 // i32 fp_offset 12206 // i64 overflow_area (address) 12207 // i64 reg_save_area (address) 12208 // } 12209 // sizeof(va_list) = 24 12210 // alignment(va_list) = 8 12211 12212 unsigned TotalNumIntRegs = 6; 12213 unsigned TotalNumXMMRegs = 8; 12214 bool UseGPOffset = (ArgMode == 1); 12215 bool UseFPOffset = (ArgMode == 2); 12216 unsigned MaxOffset = TotalNumIntRegs * 8 + 12217 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 12218 12219 /* Align ArgSize to a multiple of 8 */ 12220 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 12221 bool NeedsAlign = (Align > 8); 12222 12223 MachineBasicBlock *thisMBB = MBB; 12224 MachineBasicBlock *overflowMBB; 12225 MachineBasicBlock *offsetMBB; 12226 MachineBasicBlock *endMBB; 12227 12228 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 12229 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 12230 unsigned OffsetReg = 0; 12231 12232 if (!UseGPOffset && !UseFPOffset) { 12233 // If we only pull from the overflow region, we don't create a branch. 12234 // We don't need to alter control flow. 12235 OffsetDestReg = 0; // unused 12236 OverflowDestReg = DestReg; 12237 12238 offsetMBB = NULL; 12239 overflowMBB = thisMBB; 12240 endMBB = thisMBB; 12241 } else { 12242 // First emit code to check if gp_offset (or fp_offset) is below the bound. 12243 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 12244 // If not, pull from overflow_area. (branch to overflowMBB) 12245 // 12246 // thisMBB 12247 // | . 12248 // | . 12249 // offsetMBB overflowMBB 12250 // | . 12251 // | . 
12252 // endMBB 12253 12254 // Registers for the PHI in endMBB 12255 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 12256 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 12257 12258 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 12259 MachineFunction *MF = MBB->getParent(); 12260 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12261 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12262 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12263 12264 MachineFunction::iterator MBBIter = MBB; 12265 ++MBBIter; 12266 12267 // Insert the new basic blocks 12268 MF->insert(MBBIter, offsetMBB); 12269 MF->insert(MBBIter, overflowMBB); 12270 MF->insert(MBBIter, endMBB); 12271 12272 // Transfer the remainder of MBB and its successor edges to endMBB. 12273 endMBB->splice(endMBB->begin(), thisMBB, 12274 llvm::next(MachineBasicBlock::iterator(MI)), 12275 thisMBB->end()); 12276 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 12277 12278 // Make offsetMBB and overflowMBB successors of thisMBB 12279 thisMBB->addSuccessor(offsetMBB); 12280 thisMBB->addSuccessor(overflowMBB); 12281 12282 // endMBB is a successor of both offsetMBB and overflowMBB 12283 offsetMBB->addSuccessor(endMBB); 12284 overflowMBB->addSuccessor(endMBB); 12285 12286 // Load the offset value into a register 12287 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 12288 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 12289 .addOperand(Base) 12290 .addOperand(Scale) 12291 .addOperand(Index) 12292 .addDisp(Disp, UseFPOffset ? 4 : 0) 12293 .addOperand(Segment) 12294 .setMemRefs(MMOBegin, MMOEnd); 12295 12296 // Check if there is enough room left to pull this argument. 12297 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 12298 .addReg(OffsetReg) 12299 .addImm(MaxOffset + 8 - ArgSizeA8); 12300 12301 // Branch to "overflowMBB" if offset >= max 12302 // Fall through to "offsetMBB" otherwise 12303 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 12304 .addMBB(overflowMBB); 12305 } 12306 12307 // In offsetMBB, emit code to use the reg_save_area. 12308 if (offsetMBB) { 12309 assert(OffsetReg != 0); 12310 12311 // Read the reg_save_area address. 12312 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 12313 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 12314 .addOperand(Base) 12315 .addOperand(Scale) 12316 .addOperand(Index) 12317 .addDisp(Disp, 16) 12318 .addOperand(Segment) 12319 .setMemRefs(MMOBegin, MMOEnd); 12320 12321 // Zero-extend the offset 12322 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 12323 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 12324 .addImm(0) 12325 .addReg(OffsetReg) 12326 .addImm(X86::sub_32bit); 12327 12328 // Add the offset to the reg_save_area to get the final address. 12329 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 12330 .addReg(OffsetReg64) 12331 .addReg(RegSaveReg); 12332 12333 // Compute the offset for the next argument 12334 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 12335 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 12336 .addReg(OffsetReg) 12337 .addImm(UseFPOffset ? 16 : 8); 12338 12339 // Store it back into the va_list. 12340 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 12341 .addOperand(Base) 12342 .addOperand(Scale) 12343 .addOperand(Index) 12344 .addDisp(Disp, UseFPOffset ? 
4 : 0) 12345 .addOperand(Segment) 12346 .addReg(NextOffsetReg) 12347 .setMemRefs(MMOBegin, MMOEnd); 12348 12349 // Jump to endMBB 12350 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 12351 .addMBB(endMBB); 12352 } 12353 12354 // 12355 // Emit code to use overflow area 12356 // 12357 12358 // Load the overflow_area address into a register. 12359 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 12360 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 12361 .addOperand(Base) 12362 .addOperand(Scale) 12363 .addOperand(Index) 12364 .addDisp(Disp, 8) 12365 .addOperand(Segment) 12366 .setMemRefs(MMOBegin, MMOEnd); 12367 12368 // If we need to align it, do so. Otherwise, just copy the address 12369 // to OverflowDestReg. 12370 if (NeedsAlign) { 12371 // Align the overflow address 12372 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 12373 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 12374 12375 // aligned_addr = (addr + (align-1)) & ~(align-1) 12376 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 12377 .addReg(OverflowAddrReg) 12378 .addImm(Align-1); 12379 12380 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 12381 .addReg(TmpReg) 12382 .addImm(~(uint64_t)(Align-1)); 12383 } else { 12384 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 12385 .addReg(OverflowAddrReg); 12386 } 12387 12388 // Compute the next overflow address after this argument. 12389 // (the overflow address should be kept 8-byte aligned) 12390 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 12391 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 12392 .addReg(OverflowDestReg) 12393 .addImm(ArgSizeA8); 12394 12395 // Store the new overflow address. 12396 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 12397 .addOperand(Base) 12398 .addOperand(Scale) 12399 .addOperand(Index) 12400 .addDisp(Disp, 8) 12401 .addOperand(Segment) 12402 .addReg(NextAddrReg) 12403 .setMemRefs(MMOBegin, MMOEnd); 12404 12405 // If we branched, emit the PHI to the front of endMBB. 12406 if (offsetMBB) { 12407 BuildMI(*endMBB, endMBB->begin(), DL, 12408 TII->get(X86::PHI), DestReg) 12409 .addReg(OffsetDestReg).addMBB(offsetMBB) 12410 .addReg(OverflowDestReg).addMBB(overflowMBB); 12411 } 12412 12413 // Erase the pseudo instruction 12414 MI->eraseFromParent(); 12415 12416 return endMBB; 12417} 12418 12419MachineBasicBlock * 12420X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 12421 MachineInstr *MI, 12422 MachineBasicBlock *MBB) const { 12423 // Emit code to save XMM registers to the stack. The ABI says that the 12424 // number of registers to save is given in %al, so it's theoretically 12425 // possible to do an indirect jump trick to avoid saving all of them, 12426 // however this code takes a simpler approach and just executes all 12427 // of the stores if %al is non-zero. It's less code, and it's probably 12428 // easier on the hardware branch predictor, and stores aren't all that 12429 // expensive anyway. 12430 12431 // Create the new basic blocks. One block contains all the XMM stores, 12432 // and one block is the final destination regardless of whether any 12433 // stores were performed. 
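  // The emitted skeleton looks roughly like this (the %al test is only
  // emitted when not targeting Win64):
  //   thisMBB:
  //     test %al, %al
  //     je   EndMBB
  //   XMMSaveMBB:
  //     movaps %xmm<N>, <RegSaveFrameIndex> + VarArgsFPOffset + N*16
  //     ...                ; one aligned 16-byte store per XMM argument reg
  //   EndMBB: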
12434 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 12435 MachineFunction *F = MBB->getParent(); 12436 MachineFunction::iterator MBBIter = MBB; 12437 ++MBBIter; 12438 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 12439 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 12440 F->insert(MBBIter, XMMSaveMBB); 12441 F->insert(MBBIter, EndMBB); 12442 12443 // Transfer the remainder of MBB and its successor edges to EndMBB. 12444 EndMBB->splice(EndMBB->begin(), MBB, 12445 llvm::next(MachineBasicBlock::iterator(MI)), 12446 MBB->end()); 12447 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 12448 12449 // The original block will now fall through to the XMM save block. 12450 MBB->addSuccessor(XMMSaveMBB); 12451 // The XMMSaveMBB will fall through to the end block. 12452 XMMSaveMBB->addSuccessor(EndMBB); 12453 12454 // Now add the instructions. 12455 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12456 DebugLoc DL = MI->getDebugLoc(); 12457 12458 unsigned CountReg = MI->getOperand(0).getReg(); 12459 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 12460 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 12461 12462 if (!Subtarget->isTargetWin64()) { 12463 // If %al is 0, branch around the XMM save block. 12464 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 12465 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 12466 MBB->addSuccessor(EndMBB); 12467 } 12468 12469 unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; 12470 // In the XMM save block, save all the XMM argument registers. 12471 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 12472 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 12473 MachineMemOperand *MMO = 12474 F->getMachineMemOperand( 12475 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 12476 MachineMemOperand::MOStore, 12477 /*Size=*/16, /*Align=*/16); 12478 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) 12479 .addFrameIndex(RegSaveFrameIndex) 12480 .addImm(/*Scale=*/1) 12481 .addReg(/*IndexReg=*/0) 12482 .addImm(/*Disp=*/Offset) 12483 .addReg(/*Segment=*/0) 12484 .addReg(MI->getOperand(i).getReg()) 12485 .addMemOperand(MMO); 12486 } 12487 12488 MI->eraseFromParent(); // The pseudo instruction is gone now. 12489 12490 return EndMBB; 12491} 12492 12493// The EFLAGS operand of SelectItr might be missing a kill marker 12494// because there were multiple uses of EFLAGS, and ISel didn't know 12495// which to mark. Figure out whether SelectItr should have had a 12496// kill marker, and set it if it should. Returns the correct kill 12497// marker value. 12498static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, 12499 MachineBasicBlock* BB, 12500 const TargetRegisterInfo* TRI) { 12501 // Scan forward through BB for a use/def of EFLAGS. 12502 MachineBasicBlock::iterator miI(llvm::next(SelectItr)); 12503 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 12504 const MachineInstr& mi = *miI; 12505 if (mi.readsRegister(X86::EFLAGS)) 12506 return false; 12507 if (mi.definesRegister(X86::EFLAGS)) 12508 break; // Should have kill-flag - update below. 12509 } 12510 12511 // If we hit the end of the block, check whether EFLAGS is live into a 12512 // successor. 
12513 if (miI == BB->end()) { 12514 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 12515 sEnd = BB->succ_end(); 12516 sItr != sEnd; ++sItr) { 12517 MachineBasicBlock* succ = *sItr; 12518 if (succ->isLiveIn(X86::EFLAGS)) 12519 return false; 12520 } 12521 } 12522 12523 // We found a def, or hit the end of the basic block and EFLAGS wasn't live 12524 // out. SelectMI should have a kill flag on EFLAGS. 12525 SelectItr->addRegisterKilled(X86::EFLAGS, TRI); 12526 return true; 12527} 12528 12529MachineBasicBlock * 12530X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 12531 MachineBasicBlock *BB) const { 12532 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12533 DebugLoc DL = MI->getDebugLoc(); 12534 12535 // To "insert" a SELECT_CC instruction, we actually have to insert the 12536 // diamond control-flow pattern. The incoming instruction knows the 12537 // destination vreg to set, the condition code register to branch on, the 12538 // true/false values to select between, and a branch opcode to use. 12539 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 12540 MachineFunction::iterator It = BB; 12541 ++It; 12542 12543 // thisMBB: 12544 // ... 12545 // TrueVal = ... 12546 // cmpTY ccX, r1, r2 12547 // bCC copy1MBB 12548 // fallthrough --> copy0MBB 12549 MachineBasicBlock *thisMBB = BB; 12550 MachineFunction *F = BB->getParent(); 12551 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 12552 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 12553 F->insert(It, copy0MBB); 12554 F->insert(It, sinkMBB); 12555 12556 // If the EFLAGS register isn't dead in the terminator, then claim that it's 12557 // live into the sink and copy blocks. 12558 const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); 12559 if (!MI->killsRegister(X86::EFLAGS) && 12560 !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { 12561 copy0MBB->addLiveIn(X86::EFLAGS); 12562 sinkMBB->addLiveIn(X86::EFLAGS); 12563 } 12564 12565 // Transfer the remainder of BB and its successor edges to sinkMBB. 12566 sinkMBB->splice(sinkMBB->begin(), BB, 12567 llvm::next(MachineBasicBlock::iterator(MI)), 12568 BB->end()); 12569 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 12570 12571 // Add the true and fallthrough blocks as its successors. 12572 BB->addSuccessor(copy0MBB); 12573 BB->addSuccessor(sinkMBB); 12574 12575 // Create the conditional branch instruction. 12576 unsigned Opc = 12577 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 12578 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 12579 12580 // copy0MBB: 12581 // %FalseValue = ... 12582 // # fallthrough to sinkMBB 12583 copy0MBB->addSuccessor(sinkMBB); 12584 12585 // sinkMBB: 12586 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 12587 // ... 12588 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 12589 TII->get(X86::PHI), MI->getOperand(0).getReg()) 12590 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 12591 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 12592 12593 MI->eraseFromParent(); // The pseudo instruction is gone now. 
12594 return sinkMBB; 12595} 12596 12597MachineBasicBlock * 12598X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, 12599 bool Is64Bit) const { 12600 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12601 DebugLoc DL = MI->getDebugLoc(); 12602 MachineFunction *MF = BB->getParent(); 12603 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 12604 12605 assert(getTargetMachine().Options.EnableSegmentedStacks); 12606 12607 unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; 12608 unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; 12609 12610 // BB: 12611 // ... [Till the alloca] 12612 // If stacklet is not large enough, jump to mallocMBB 12613 // 12614 // bumpMBB: 12615 // Allocate by subtracting from RSP 12616 // Jump to continueMBB 12617 // 12618 // mallocMBB: 12619 // Allocate by call to runtime 12620 // 12621 // continueMBB: 12622 // ... 12623 // [rest of original BB] 12624 // 12625 12626 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12627 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12628 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12629 12630 MachineRegisterInfo &MRI = MF->getRegInfo(); 12631 const TargetRegisterClass *AddrRegClass = 12632 getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); 12633 12634 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), 12635 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), 12636 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), 12637 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), 12638 sizeVReg = MI->getOperand(1).getReg(), 12639 physSPReg = Is64Bit ? X86::RSP : X86::ESP; 12640 12641 MachineFunction::iterator MBBIter = BB; 12642 ++MBBIter; 12643 12644 MF->insert(MBBIter, bumpMBB); 12645 MF->insert(MBBIter, mallocMBB); 12646 MF->insert(MBBIter, continueMBB); 12647 12648 continueMBB->splice(continueMBB->begin(), BB, llvm::next 12649 (MachineBasicBlock::iterator(MI)), BB->end()); 12650 continueMBB->transferSuccessorsAndUpdatePHIs(BB); 12651 12652 // Add code to the main basic block to check if the stack limit has been hit, 12653 // and if so, jump to mallocMBB otherwise to bumpMBB. 12654 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); 12655 BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) 12656 .addReg(tmpSPVReg).addReg(sizeVReg); 12657 BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) 12658 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) 12659 .addReg(SPLimitVReg); 12660 BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); 12661 12662 // bumpMBB simply decreases the stack pointer, since we know the current 12663 // stacklet has enough space. 12664 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) 12665 .addReg(SPLimitVReg); 12666 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) 12667 .addReg(SPLimitVReg); 12668 BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 12669 12670 // Calls into a routine in libgcc to allocate more space from the heap. 
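  // __morestack_allocate_stack_space takes the requested size in RDI on
  // 64-bit targets; on 32-bit it is passed on the stack (the SUB $12 / PUSH /
  // ADD $16 sequence below presumably keeps the outgoing stack pointer
  // 16-byte aligned around the 4-byte push). The address of the newly
  // allocated block comes back in RAX/EAX.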
12671 const uint32_t *RegMask = 12672 getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); 12673 if (Is64Bit) { 12674 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) 12675 .addReg(sizeVReg); 12676 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 12677 .addExternalSymbol("__morestack_allocate_stack_space") 12678 .addRegMask(RegMask) 12679 .addReg(X86::RDI, RegState::Implicit) 12680 .addReg(X86::RAX, RegState::ImplicitDefine); 12681 } else { 12682 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) 12683 .addImm(12); 12684 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); 12685 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) 12686 .addExternalSymbol("__morestack_allocate_stack_space") 12687 .addRegMask(RegMask) 12688 .addReg(X86::EAX, RegState::ImplicitDefine); 12689 } 12690 12691 if (!Is64Bit) 12692 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) 12693 .addImm(16); 12694 12695 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) 12696 .addReg(Is64Bit ? X86::RAX : X86::EAX); 12697 BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 12698 12699 // Set up the CFG correctly. 12700 BB->addSuccessor(bumpMBB); 12701 BB->addSuccessor(mallocMBB); 12702 mallocMBB->addSuccessor(continueMBB); 12703 bumpMBB->addSuccessor(continueMBB); 12704 12705 // Take care of the PHI nodes. 12706 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), 12707 MI->getOperand(0).getReg()) 12708 .addReg(mallocPtrVReg).addMBB(mallocMBB) 12709 .addReg(bumpSPPtrVReg).addMBB(bumpMBB); 12710 12711 // Delete the original pseudo instruction. 12712 MI->eraseFromParent(); 12713 12714 // And we're done. 12715 return continueMBB; 12716} 12717 12718MachineBasicBlock * 12719X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 12720 MachineBasicBlock *BB) const { 12721 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12722 DebugLoc DL = MI->getDebugLoc(); 12723 12724 assert(!Subtarget->isTargetEnvMacho()); 12725 12726 // The lowering is pretty easy: we're just emitting the call to _alloca. The 12727 // non-trivial part is impdef of ESP. 12728 12729 if (Subtarget->isTargetWin64()) { 12730 if (Subtarget->isTargetCygMing()) { 12731 // ___chkstk(Mingw64): 12732 // Clobbers R10, R11, RAX and EFLAGS. 12733 // Updates RSP. 12734 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 12735 .addExternalSymbol("___chkstk") 12736 .addReg(X86::RAX, RegState::Implicit) 12737 .addReg(X86::RSP, RegState::Implicit) 12738 .addReg(X86::RAX, RegState::Define | RegState::Implicit) 12739 .addReg(X86::RSP, RegState::Define | RegState::Implicit) 12740 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 12741 } else { 12742 // __chkstk(MSVCRT): does not update stack pointer. 12743 // Clobbers R10, R11 and EFLAGS. 12744 // FIXME: RAX(allocated size) might be reused and not killed. 12745 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 12746 .addExternalSymbol("__chkstk") 12747 .addReg(X86::RAX, RegState::Implicit) 12748 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 12749 // RAX has the offset to subtracted from RSP. 12750 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) 12751 .addReg(X86::RSP) 12752 .addReg(X86::RAX); 12753 } 12754 } else { 12755 const char *StackProbeSymbol = 12756 Subtarget->isTargetWindows() ? 
"_chkstk" : "_alloca"; 12757 12758 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 12759 .addExternalSymbol(StackProbeSymbol) 12760 .addReg(X86::EAX, RegState::Implicit) 12761 .addReg(X86::ESP, RegState::Implicit) 12762 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 12763 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 12764 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 12765 } 12766 12767 MI->eraseFromParent(); // The pseudo instruction is gone now. 12768 return BB; 12769} 12770 12771MachineBasicBlock * 12772X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 12773 MachineBasicBlock *BB) const { 12774 // This is pretty easy. We're taking the value that we received from 12775 // our load from the relocation, sticking it in either RDI (x86-64) 12776 // or EAX and doing an indirect call. The return value will then 12777 // be in the normal return register. 12778 const X86InstrInfo *TII 12779 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 12780 DebugLoc DL = MI->getDebugLoc(); 12781 MachineFunction *F = BB->getParent(); 12782 12783 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 12784 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 12785 12786 // Get a register mask for the lowered call. 12787 // FIXME: The 32-bit calls have non-standard calling conventions. Use a 12788 // proper register mask. 12789 const uint32_t *RegMask = 12790 getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); 12791 if (Subtarget->is64Bit()) { 12792 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 12793 TII->get(X86::MOV64rm), X86::RDI) 12794 .addReg(X86::RIP) 12795 .addImm(0).addReg(0) 12796 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 12797 MI->getOperand(3).getTargetFlags()) 12798 .addReg(0); 12799 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 12800 addDirectMem(MIB, X86::RDI); 12801 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); 12802 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 12803 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 12804 TII->get(X86::MOV32rm), X86::EAX) 12805 .addReg(0) 12806 .addImm(0).addReg(0) 12807 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 12808 MI->getOperand(3).getTargetFlags()) 12809 .addReg(0); 12810 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 12811 addDirectMem(MIB, X86::EAX); 12812 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 12813 } else { 12814 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 12815 TII->get(X86::MOV32rm), X86::EAX) 12816 .addReg(TII->getGlobalBaseReg(F)) 12817 .addImm(0).addReg(0) 12818 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 12819 MI->getOperand(3).getTargetFlags()) 12820 .addReg(0); 12821 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 12822 addDirectMem(MIB, X86::EAX); 12823 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 12824 } 12825 12826 MI->eraseFromParent(); // The pseudo instruction is gone now. 
12827 return BB; 12828} 12829 12830MachineBasicBlock * 12831X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 12832 MachineBasicBlock *BB) const { 12833 switch (MI->getOpcode()) { 12834 default: llvm_unreachable("Unexpected instr type to insert"); 12835 case X86::TAILJMPd64: 12836 case X86::TAILJMPr64: 12837 case X86::TAILJMPm64: 12838 llvm_unreachable("TAILJMP64 would not be touched here."); 12839 case X86::TCRETURNdi64: 12840 case X86::TCRETURNri64: 12841 case X86::TCRETURNmi64: 12842 return BB; 12843 case X86::WIN_ALLOCA: 12844 return EmitLoweredWinAlloca(MI, BB); 12845 case X86::SEG_ALLOCA_32: 12846 return EmitLoweredSegAlloca(MI, BB, false); 12847 case X86::SEG_ALLOCA_64: 12848 return EmitLoweredSegAlloca(MI, BB, true); 12849 case X86::TLSCall_32: 12850 case X86::TLSCall_64: 12851 return EmitLoweredTLSCall(MI, BB); 12852 case X86::CMOV_GR8: 12853 case X86::CMOV_FR32: 12854 case X86::CMOV_FR64: 12855 case X86::CMOV_V4F32: 12856 case X86::CMOV_V2F64: 12857 case X86::CMOV_V2I64: 12858 case X86::CMOV_V8F32: 12859 case X86::CMOV_V4F64: 12860 case X86::CMOV_V4I64: 12861 case X86::CMOV_GR16: 12862 case X86::CMOV_GR32: 12863 case X86::CMOV_RFP32: 12864 case X86::CMOV_RFP64: 12865 case X86::CMOV_RFP80: 12866 return EmitLoweredSelect(MI, BB); 12867 12868 case X86::FP32_TO_INT16_IN_MEM: 12869 case X86::FP32_TO_INT32_IN_MEM: 12870 case X86::FP32_TO_INT64_IN_MEM: 12871 case X86::FP64_TO_INT16_IN_MEM: 12872 case X86::FP64_TO_INT32_IN_MEM: 12873 case X86::FP64_TO_INT64_IN_MEM: 12874 case X86::FP80_TO_INT16_IN_MEM: 12875 case X86::FP80_TO_INT32_IN_MEM: 12876 case X86::FP80_TO_INT64_IN_MEM: { 12877 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12878 DebugLoc DL = MI->getDebugLoc(); 12879 12880 // Change the floating point control register to use "round towards zero" 12881 // mode when truncating to an integer value. 12882 MachineFunction *F = BB->getParent(); 12883 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 12884 addFrameReference(BuildMI(*BB, MI, DL, 12885 TII->get(X86::FNSTCW16m)), CWFrameIdx); 12886 12887 // Load the old value of the high byte of the control word... 12888 unsigned OldCW = 12889 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); 12890 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 12891 CWFrameIdx); 12892 12893 // Set the high part to be round to zero... 12894 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 12895 .addImm(0xC7F); 12896 12897 // Reload the modified control word now... 12898 addFrameReference(BuildMI(*BB, MI, DL, 12899 TII->get(X86::FLDCW16m)), CWFrameIdx); 12900 12901 // Restore the memory image of control word to original value 12902 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 12903 .addReg(OldCW); 12904 12905 // Get the X86 opcode to use. 
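    // The IST_Fp<n>m<m> pseudos store an f32/f64/f80 stack value to an
    // <n>-bit integer memory slot using the current rounding mode, which was
    // just forced to round-towards-zero above.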
12906 unsigned Opc; 12907 switch (MI->getOpcode()) { 12908 default: llvm_unreachable("illegal opcode!"); 12909 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 12910 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 12911 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 12912 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 12913 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 12914 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 12915 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 12916 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 12917 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 12918 } 12919 12920 X86AddressMode AM; 12921 MachineOperand &Op = MI->getOperand(0); 12922 if (Op.isReg()) { 12923 AM.BaseType = X86AddressMode::RegBase; 12924 AM.Base.Reg = Op.getReg(); 12925 } else { 12926 AM.BaseType = X86AddressMode::FrameIndexBase; 12927 AM.Base.FrameIndex = Op.getIndex(); 12928 } 12929 Op = MI->getOperand(1); 12930 if (Op.isImm()) 12931 AM.Scale = Op.getImm(); 12932 Op = MI->getOperand(2); 12933 if (Op.isImm()) 12934 AM.IndexReg = Op.getImm(); 12935 Op = MI->getOperand(3); 12936 if (Op.isGlobal()) { 12937 AM.GV = Op.getGlobal(); 12938 } else { 12939 AM.Disp = Op.getImm(); 12940 } 12941 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 12942 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 12943 12944 // Reload the original control word now. 12945 addFrameReference(BuildMI(*BB, MI, DL, 12946 TII->get(X86::FLDCW16m)), CWFrameIdx); 12947 12948 MI->eraseFromParent(); // The pseudo instruction is gone now. 12949 return BB; 12950 } 12951 // String/text processing lowering. 12952 case X86::PCMPISTRM128REG: 12953 case X86::VPCMPISTRM128REG: 12954 case X86::PCMPISTRM128MEM: 12955 case X86::VPCMPISTRM128MEM: 12956 case X86::PCMPESTRM128REG: 12957 case X86::VPCMPESTRM128REG: 12958 case X86::PCMPESTRM128MEM: 12959 case X86::VPCMPESTRM128MEM: { 12960 unsigned NumArgs; 12961 bool MemArg; 12962 switch (MI->getOpcode()) { 12963 default: llvm_unreachable("illegal opcode!"); 12964 case X86::PCMPISTRM128REG: 12965 case X86::VPCMPISTRM128REG: 12966 NumArgs = 3; MemArg = false; break; 12967 case X86::PCMPISTRM128MEM: 12968 case X86::VPCMPISTRM128MEM: 12969 NumArgs = 3; MemArg = true; break; 12970 case X86::PCMPESTRM128REG: 12971 case X86::VPCMPESTRM128REG: 12972 NumArgs = 5; MemArg = false; break; 12973 case X86::PCMPESTRM128MEM: 12974 case X86::VPCMPESTRM128MEM: 12975 NumArgs = 5; MemArg = true; break; 12976 } 12977 return EmitPCMP(MI, BB, NumArgs, MemArg); 12978 } 12979 12980 // Thread synchronization. 12981 case X86::MONITOR: 12982 return EmitMonitor(MI, BB); 12983 12984 // Atomic Lowering. 
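  // Each ATOM* pseudo below expands to a compare-and-swap loop built by one
  // of the EmitAtomic*WithCustomInserter helpers above; these switches only
  // select the arithmetic (or cmov) opcodes used inside that loop.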
12985 case X86::ATOMMIN32: 12986 case X86::ATOMMAX32: 12987 case X86::ATOMUMIN32: 12988 case X86::ATOMUMAX32: 12989 case X86::ATOMMIN16: 12990 case X86::ATOMMAX16: 12991 case X86::ATOMUMIN16: 12992 case X86::ATOMUMAX16: 12993 case X86::ATOMMIN64: 12994 case X86::ATOMMAX64: 12995 case X86::ATOMUMIN64: 12996 case X86::ATOMUMAX64: { 12997 unsigned Opc; 12998 switch (MI->getOpcode()) { 12999 default: llvm_unreachable("illegal opcode!"); 13000 case X86::ATOMMIN32: Opc = X86::CMOVL32rr; break; 13001 case X86::ATOMMAX32: Opc = X86::CMOVG32rr; break; 13002 case X86::ATOMUMIN32: Opc = X86::CMOVB32rr; break; 13003 case X86::ATOMUMAX32: Opc = X86::CMOVA32rr; break; 13004 case X86::ATOMMIN16: Opc = X86::CMOVL16rr; break; 13005 case X86::ATOMMAX16: Opc = X86::CMOVG16rr; break; 13006 case X86::ATOMUMIN16: Opc = X86::CMOVB16rr; break; 13007 case X86::ATOMUMAX16: Opc = X86::CMOVA16rr; break; 13008 case X86::ATOMMIN64: Opc = X86::CMOVL64rr; break; 13009 case X86::ATOMMAX64: Opc = X86::CMOVG64rr; break; 13010 case X86::ATOMUMIN64: Opc = X86::CMOVB64rr; break; 13011 case X86::ATOMUMAX64: Opc = X86::CMOVA64rr; break; 13012 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 13013 } 13014 return EmitAtomicMinMaxWithCustomInserter(MI, BB, Opc); 13015 } 13016 13017 case X86::ATOMAND32: 13018 case X86::ATOMOR32: 13019 case X86::ATOMXOR32: 13020 case X86::ATOMNAND32: { 13021 bool Invert = false; 13022 unsigned RegOpc, ImmOpc; 13023 switch (MI->getOpcode()) { 13024 default: llvm_unreachable("illegal opcode!"); 13025 case X86::ATOMAND32: 13026 RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; break; 13027 case X86::ATOMOR32: 13028 RegOpc = X86::OR32rr; ImmOpc = X86::OR32ri; break; 13029 case X86::ATOMXOR32: 13030 RegOpc = X86::XOR32rr; ImmOpc = X86::XOR32ri; break; 13031 case X86::ATOMNAND32: 13032 RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; Invert = true; break; 13033 } 13034 return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc, 13035 X86::MOV32rm, X86::LCMPXCHG32, 13036 X86::NOT32r, X86::EAX, 13037 &X86::GR32RegClass, Invert); 13038 } 13039 13040 case X86::ATOMAND16: 13041 case X86::ATOMOR16: 13042 case X86::ATOMXOR16: 13043 case X86::ATOMNAND16: { 13044 bool Invert = false; 13045 unsigned RegOpc, ImmOpc; 13046 switch (MI->getOpcode()) { 13047 default: llvm_unreachable("illegal opcode!"); 13048 case X86::ATOMAND16: 13049 RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; break; 13050 case X86::ATOMOR16: 13051 RegOpc = X86::OR16rr; ImmOpc = X86::OR16ri; break; 13052 case X86::ATOMXOR16: 13053 RegOpc = X86::XOR16rr; ImmOpc = X86::XOR16ri; break; 13054 case X86::ATOMNAND16: 13055 RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; Invert = true; break; 13056 } 13057 return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc, 13058 X86::MOV16rm, X86::LCMPXCHG16, 13059 X86::NOT16r, X86::AX, 13060 &X86::GR16RegClass, Invert); 13061 } 13062 13063 case X86::ATOMAND8: 13064 case X86::ATOMOR8: 13065 case X86::ATOMXOR8: 13066 case X86::ATOMNAND8: { 13067 bool Invert = false; 13068 unsigned RegOpc, ImmOpc; 13069 switch (MI->getOpcode()) { 13070 default: llvm_unreachable("illegal opcode!"); 13071 case X86::ATOMAND8: 13072 RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; break; 13073 case X86::ATOMOR8: 13074 RegOpc = X86::OR8rr; ImmOpc = X86::OR8ri; break; 13075 case X86::ATOMXOR8: 13076 RegOpc = X86::XOR8rr; ImmOpc = X86::XOR8ri; break; 13077 case X86::ATOMNAND8: 13078 RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; Invert = true; break; 13079 } 13080 return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, 
ImmOpc, 13081 X86::MOV8rm, X86::LCMPXCHG8, 13082 X86::NOT8r, X86::AL, 13083 &X86::GR8RegClass, Invert); 13084 } 13085 13086 // This group is for 64-bit host. 13087 case X86::ATOMAND64: 13088 case X86::ATOMOR64: 13089 case X86::ATOMXOR64: 13090 case X86::ATOMNAND64: { 13091 bool Invert = false; 13092 unsigned RegOpc, ImmOpc; 13093 switch (MI->getOpcode()) { 13094 default: llvm_unreachable("illegal opcode!"); 13095 case X86::ATOMAND64: 13096 RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; break; 13097 case X86::ATOMOR64: 13098 RegOpc = X86::OR64rr; ImmOpc = X86::OR64ri32; break; 13099 case X86::ATOMXOR64: 13100 RegOpc = X86::XOR64rr; ImmOpc = X86::XOR64ri32; break; 13101 case X86::ATOMNAND64: 13102 RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; Invert = true; break; 13103 } 13104 return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc, 13105 X86::MOV64rm, X86::LCMPXCHG64, 13106 X86::NOT64r, X86::RAX, 13107 &X86::GR64RegClass, Invert); 13108 } 13109 13110 // This group does 64-bit operations on a 32-bit host. 13111 case X86::ATOMAND6432: 13112 case X86::ATOMOR6432: 13113 case X86::ATOMXOR6432: 13114 case X86::ATOMNAND6432: 13115 case X86::ATOMADD6432: 13116 case X86::ATOMSUB6432: 13117 case X86::ATOMSWAP6432: { 13118 bool Invert = false; 13119 unsigned RegOpcL, RegOpcH, ImmOpcL, ImmOpcH; 13120 switch (MI->getOpcode()) { 13121 default: llvm_unreachable("illegal opcode!"); 13122 case X86::ATOMAND6432: 13123 RegOpcL = RegOpcH = X86::AND32rr; 13124 ImmOpcL = ImmOpcH = X86::AND32ri; 13125 break; 13126 case X86::ATOMOR6432: 13127 RegOpcL = RegOpcH = X86::OR32rr; 13128 ImmOpcL = ImmOpcH = X86::OR32ri; 13129 break; 13130 case X86::ATOMXOR6432: 13131 RegOpcL = RegOpcH = X86::XOR32rr; 13132 ImmOpcL = ImmOpcH = X86::XOR32ri; 13133 break; 13134 case X86::ATOMNAND6432: 13135 RegOpcL = RegOpcH = X86::AND32rr; 13136 ImmOpcL = ImmOpcH = X86::AND32ri; 13137 Invert = true; 13138 break; 13139 case X86::ATOMADD6432: 13140 RegOpcL = X86::ADD32rr; RegOpcH = X86::ADC32rr; 13141 ImmOpcL = X86::ADD32ri; ImmOpcH = X86::ADC32ri; 13142 break; 13143 case X86::ATOMSUB6432: 13144 RegOpcL = X86::SUB32rr; RegOpcH = X86::SBB32rr; 13145 ImmOpcL = X86::SUB32ri; ImmOpcH = X86::SBB32ri; 13146 break; 13147 case X86::ATOMSWAP6432: 13148 RegOpcL = RegOpcH = X86::MOV32rr; 13149 ImmOpcL = ImmOpcH = X86::MOV32ri; 13150 break; 13151 } 13152 return EmitAtomicBit6432WithCustomInserter(MI, BB, RegOpcL, RegOpcH, 13153 ImmOpcL, ImmOpcH, Invert); 13154 } 13155 13156 case X86::VASTART_SAVE_XMM_REGS: 13157 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 13158 13159 case X86::VAARG_64: 13160 return EmitVAARG64WithCustomInserter(MI, BB); 13161 } 13162} 13163 13164//===----------------------------------------------------------------------===// 13165// X86 Optimization Hooks 13166//===----------------------------------------------------------------------===// 13167 13168void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 13169 APInt &KnownZero, 13170 APInt &KnownOne, 13171 const SelectionDAG &DAG, 13172 unsigned Depth) const { 13173 unsigned BitWidth = KnownZero.getBitWidth(); 13174 unsigned Opc = Op.getOpcode(); 13175 assert((Opc >= ISD::BUILTIN_OP_END || 13176 Opc == ISD::INTRINSIC_WO_CHAIN || 13177 Opc == ISD::INTRINSIC_W_CHAIN || 13178 Opc == ISD::INTRINSIC_VOID) && 13179 "Should use MaskedValueIsZero if you don't know whether Op" 13180 " is a target node!"); 13181 13182 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 
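  // Only a few X86 nodes tell us anything here: the boolean second result of
  // the flag-producing arithmetic nodes and SETCC, and the MOVMSK/PMOVMSKB
  // intrinsics, whose upper result bits are always zero.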
13183 switch (Opc) { 13184 default: break; 13185 case X86ISD::ADD: 13186 case X86ISD::SUB: 13187 case X86ISD::ADC: 13188 case X86ISD::SBB: 13189 case X86ISD::SMUL: 13190 case X86ISD::UMUL: 13191 case X86ISD::INC: 13192 case X86ISD::DEC: 13193 case X86ISD::OR: 13194 case X86ISD::XOR: 13195 case X86ISD::AND: 13196 // These nodes' second result is a boolean. 13197 if (Op.getResNo() == 0) 13198 break; 13199 // Fallthrough 13200 case X86ISD::SETCC: 13201 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 13202 break; 13203 case ISD::INTRINSIC_WO_CHAIN: { 13204 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 13205 unsigned NumLoBits = 0; 13206 switch (IntId) { 13207 default: break; 13208 case Intrinsic::x86_sse_movmsk_ps: 13209 case Intrinsic::x86_avx_movmsk_ps_256: 13210 case Intrinsic::x86_sse2_movmsk_pd: 13211 case Intrinsic::x86_avx_movmsk_pd_256: 13212 case Intrinsic::x86_mmx_pmovmskb: 13213 case Intrinsic::x86_sse2_pmovmskb_128: 13214 case Intrinsic::x86_avx2_pmovmskb: { 13215 // High bits of movmskp{s|d}, pmovmskb are known zero. 13216 switch (IntId) { 13217 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 13218 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; 13219 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; 13220 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; 13221 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; 13222 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; 13223 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; 13224 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; 13225 } 13226 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); 13227 break; 13228 } 13229 } 13230 break; 13231 } 13232 } 13233} 13234 13235unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 13236 unsigned Depth) const { 13237 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 13238 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 13239 return Op.getValueType().getScalarType().getSizeInBits(); 13240 13241 // Fallback case. 13242 return 1; 13243} 13244 13245/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 13246/// node is a GlobalAddress + offset. 
13247bool X86TargetLowering::isGAPlusOffset(SDNode *N, 13248 const GlobalValue* &GA, 13249 int64_t &Offset) const { 13250 if (N->getOpcode() == X86ISD::Wrapper) { 13251 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 13252 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 13253 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 13254 return true; 13255 } 13256 } 13257 return TargetLowering::isGAPlusOffset(N, GA, Offset); 13258} 13259 13260/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 13261/// same as extracting the high 128-bit part of 256-bit vector and then 13262/// inserting the result into the low part of a new 256-bit vector 13263static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 13264 EVT VT = SVOp->getValueType(0); 13265 unsigned NumElems = VT.getVectorNumElements(); 13266 13267 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 13268 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) 13269 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 13270 SVOp->getMaskElt(j) >= 0) 13271 return false; 13272 13273 return true; 13274} 13275 13276/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 13277/// same as extracting the low 128-bit part of 256-bit vector and then 13278/// inserting the result into the high part of a new 256-bit vector 13279static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 13280 EVT VT = SVOp->getValueType(0); 13281 unsigned NumElems = VT.getVectorNumElements(); 13282 13283 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 13284 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) 13285 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 13286 SVOp->getMaskElt(j) >= 0) 13287 return false; 13288 13289 return true; 13290} 13291 13292/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 13293static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 13294 TargetLowering::DAGCombinerInfo &DCI, 13295 const X86Subtarget* Subtarget) { 13296 DebugLoc dl = N->getDebugLoc(); 13297 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 13298 SDValue V1 = SVOp->getOperand(0); 13299 SDValue V2 = SVOp->getOperand(1); 13300 EVT VT = SVOp->getValueType(0); 13301 unsigned NumElems = VT.getVectorNumElements(); 13302 13303 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 13304 V2.getOpcode() == ISD::CONCAT_VECTORS) { 13305 // 13306 // 0,0,0,... 13307 // | 13308 // V UNDEF BUILD_VECTOR UNDEF 13309 // \ / \ / 13310 // CONCAT_VECTOR CONCAT_VECTOR 13311 // \ / 13312 // \ / 13313 // RESULT: V + zero extended 13314 // 13315 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 13316 V2.getOperand(1).getOpcode() != ISD::UNDEF || 13317 V1.getOperand(1).getOpcode() != ISD::UNDEF) 13318 return SDValue(); 13319 13320 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 13321 return SDValue(); 13322 13323 // To match the shuffle mask, the first half of the mask should 13324 // be exactly the first vector, and all the rest a splat with the 13325 // first element of the second one. 13326 for (unsigned i = 0; i != NumElems/2; ++i) 13327 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 13328 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 13329 return SDValue(); 13330 13331 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 
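    // A VZEXT_LOAD reads the 128-bit value from memory and leaves the upper
    // half of the 256-bit result zero, which is exactly the <V1, 0> pattern
    // the mask checks above established.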
13332 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { 13333 if (Ld->hasNUsesOfValue(1, 0)) { 13334 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); 13335 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; 13336 SDValue ResNode = 13337 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2, 13338 Ld->getMemoryVT(), 13339 Ld->getPointerInfo(), 13340 Ld->getAlignment(), 13341 false/*isVolatile*/, true/*ReadMem*/, 13342 false/*WriteMem*/); 13343 return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); 13344 } 13345 } 13346 13347 // Emit a zeroed vector and insert the desired subvector on its 13348 // first half. 13349 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 13350 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); 13351 return DCI.CombineTo(N, InsV); 13352 } 13353 13354 //===--------------------------------------------------------------------===// 13355 // Combine some shuffles into subvector extracts and inserts: 13356 // 13357 13358 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 13359 if (isShuffleHigh128VectorInsertLow(SVOp)) { 13360 SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); 13361 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); 13362 return DCI.CombineTo(N, InsV); 13363 } 13364 13365 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 13366 if (isShuffleLow128VectorInsertHigh(SVOp)) { 13367 SDValue V = Extract128BitVector(V1, 0, DAG, dl); 13368 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); 13369 return DCI.CombineTo(N, InsV); 13370 } 13371 13372 return SDValue(); 13373} 13374 13375/// PerformShuffleCombine - Performs several different shuffle combines. 13376static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 13377 TargetLowering::DAGCombinerInfo &DCI, 13378 const X86Subtarget *Subtarget) { 13379 DebugLoc dl = N->getDebugLoc(); 13380 EVT VT = N->getValueType(0); 13381 13382 // Don't create instructions with illegal types after legalize types has run. 13383 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13384 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 13385 return SDValue(); 13386 13387 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode 13388 if (Subtarget->hasAVX() && VT.is256BitVector() && 13389 N->getOpcode() == ISD::VECTOR_SHUFFLE) 13390 return PerformShuffleCombine256(N, DAG, DCI, Subtarget); 13391 13392 // Only handle 128 wide vector from here on. 13393 if (!VT.is128BitVector()) 13394 return SDValue(); 13395 13396 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 13397 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 13398 // consecutive, non-overlapping, and in the right order. 13399 SmallVector<SDValue, 16> Elts; 13400 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 13401 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 13402 13403 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 13404} 13405 13406 13407/// DCI, PerformTruncateCombine - Converts truncate operation to 13408/// a sequence of vector shuffle operations. 
13409/// It is possible when we truncate 256-bit vector to 128-bit vector 13410 13411SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, 13412 DAGCombinerInfo &DCI) const { 13413 if (!DCI.isBeforeLegalizeOps()) 13414 return SDValue(); 13415 13416 if (!Subtarget->hasAVX()) 13417 return SDValue(); 13418 13419 EVT VT = N->getValueType(0); 13420 SDValue Op = N->getOperand(0); 13421 EVT OpVT = Op.getValueType(); 13422 DebugLoc dl = N->getDebugLoc(); 13423 13424 if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) { 13425 13426 if (Subtarget->hasAVX2()) { 13427 // AVX2: v4i64 -> v4i32 13428 13429 // VPERMD 13430 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; 13431 13432 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op); 13433 Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32), 13434 ShufMask); 13435 13436 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op, 13437 DAG.getIntPtrConstant(0)); 13438 } 13439 13440 // AVX: v4i64 -> v4i32 13441 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, 13442 DAG.getIntPtrConstant(0)); 13443 13444 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, 13445 DAG.getIntPtrConstant(2)); 13446 13447 OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); 13448 OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); 13449 13450 // PSHUFD 13451 static const int ShufMask1[] = {0, 2, 0, 0}; 13452 13453 SDValue Undef = DAG.getUNDEF(VT); 13454 OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1); 13455 OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1); 13456 13457 // MOVLHPS 13458 static const int ShufMask2[] = {0, 1, 4, 5}; 13459 13460 return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2); 13461 } 13462 13463 if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) { 13464 13465 if (Subtarget->hasAVX2()) { 13466 // AVX2: v8i32 -> v8i16 13467 13468 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op); 13469 13470 // PSHUFB 13471 SmallVector<SDValue,32> pshufbMask; 13472 for (unsigned i = 0; i < 2; ++i) { 13473 pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); 13474 pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); 13475 pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); 13476 pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); 13477 pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); 13478 pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); 13479 pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); 13480 pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); 13481 for (unsigned j = 0; j < 8; ++j) 13482 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 13483 } 13484 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8, 13485 &pshufbMask[0], 32); 13486 Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV); 13487 13488 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op); 13489 13490 static const int ShufMask[] = {0, 2, -1, -1}; 13491 Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64), 13492 &ShufMask[0]); 13493 13494 Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, 13495 DAG.getIntPtrConstant(0)); 13496 13497 return DAG.getNode(ISD::BITCAST, dl, VT, Op); 13498 } 13499 13500 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, 13501 DAG.getIntPtrConstant(0)); 13502 13503 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, 13504 DAG.getIntPtrConstant(4)); 13505 13506 OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo); 13507 OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi); 
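    // Without AVX2 the v8i32 is handled as two 128-bit halves: PSHUFB packs
    // the low 16 bits of each i32 into the low 64 bits of each half, and the
    // two halves are then merged with a MOVLHPS-style <0,1,4,5> shuffle.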
13508 13509 // PSHUFB 13510 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, 13511 -1, -1, -1, -1, -1, -1, -1, -1}; 13512 13513 SDValue Undef = DAG.getUNDEF(MVT::v16i8); 13514 OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1); 13515 OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1); 13516 13517 OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); 13518 OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); 13519 13520 // MOVLHPS 13521 static const int ShufMask2[] = {0, 1, 4, 5}; 13522 13523 SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2); 13524 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res); 13525 } 13526 13527 return SDValue(); 13528} 13529 13530/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target 13531/// specific shuffle of a load can be folded into a single element load. 13532/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but 13533/// shuffles have been customed lowered so we need to handle those here. 13534static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, 13535 TargetLowering::DAGCombinerInfo &DCI) { 13536 if (DCI.isBeforeLegalizeOps()) 13537 return SDValue(); 13538 13539 SDValue InVec = N->getOperand(0); 13540 SDValue EltNo = N->getOperand(1); 13541 13542 if (!isa<ConstantSDNode>(EltNo)) 13543 return SDValue(); 13544 13545 EVT VT = InVec.getValueType(); 13546 13547 bool HasShuffleIntoBitcast = false; 13548 if (InVec.getOpcode() == ISD::BITCAST) { 13549 // Don't duplicate a load with other uses. 13550 if (!InVec.hasOneUse()) 13551 return SDValue(); 13552 EVT BCVT = InVec.getOperand(0).getValueType(); 13553 if (BCVT.getVectorNumElements() != VT.getVectorNumElements()) 13554 return SDValue(); 13555 InVec = InVec.getOperand(0); 13556 HasShuffleIntoBitcast = true; 13557 } 13558 13559 if (!isTargetShuffle(InVec.getOpcode())) 13560 return SDValue(); 13561 13562 // Don't duplicate a load with other uses. 13563 if (!InVec.hasOneUse()) 13564 return SDValue(); 13565 13566 SmallVector<int, 16> ShuffleMask; 13567 bool UnaryShuffle; 13568 if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask, 13569 UnaryShuffle)) 13570 return SDValue(); 13571 13572 // Select the input vector, guarding against out of range extract vector. 13573 unsigned NumElems = VT.getVectorNumElements(); 13574 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 13575 int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; 13576 SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) 13577 : InVec.getOperand(1); 13578 13579 // If inputs to shuffle are the same for both ops, then allow 2 uses 13580 unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; 13581 13582 if (LdNode.getOpcode() == ISD::BITCAST) { 13583 // Don't duplicate a load with other uses. 13584 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) 13585 return SDValue(); 13586 13587 AllowedUses = 1; // only allow 1 load use if we have a bitcast 13588 LdNode = LdNode.getOperand(0); 13589 } 13590 13591 if (!ISD::isNormalLoad(LdNode.getNode())) 13592 return SDValue(); 13593 13594 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); 13595 13596 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) 13597 return SDValue(); 13598 13599 if (HasShuffleIntoBitcast) { 13600 // If there's a bitcast before the shuffle, check if the load type and 13601 // alignment is valid. 
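    // Re-typing the load through the bitcast changes its value type to VT, so
    // make sure VT's ABI alignment does not exceed the original load's
    // alignment and that a load of VT is legal or custom on this target.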
13602 unsigned Align = LN0->getAlignment(); 13603 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13604 unsigned NewAlign = TLI.getTargetData()-> 13605 getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); 13606 13607 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 13608 return SDValue(); 13609 } 13610 13611 // All checks match so transform back to vector_shuffle so that DAG combiner 13612 // can finish the job 13613 DebugLoc dl = N->getDebugLoc(); 13614 13615 // Create shuffle node taking into account the case that its a unary shuffle 13616 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); 13617 Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl, 13618 InVec.getOperand(0), Shuffle, 13619 &ShuffleMask[0]); 13620 Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 13621 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, 13622 EltNo); 13623} 13624 13625/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 13626/// generation and convert it from being a bunch of shuffles and extracts 13627/// to a simple store and scalar loads to extract the elements. 13628static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 13629 TargetLowering::DAGCombinerInfo &DCI) { 13630 SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); 13631 if (NewOp.getNode()) 13632 return NewOp; 13633 13634 SDValue InputVector = N->getOperand(0); 13635 13636 // Only operate on vectors of 4 elements, where the alternative shuffling 13637 // gets to be more expensive. 13638 if (InputVector.getValueType() != MVT::v4i32) 13639 return SDValue(); 13640 13641 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 13642 // single use which is a sign-extend or zero-extend, and all elements are 13643 // used. 13644 SmallVector<SDNode *, 4> Uses; 13645 unsigned ExtractedElements = 0; 13646 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 13647 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 13648 if (UI.getUse().getResNo() != InputVector.getResNo()) 13649 return SDValue(); 13650 13651 SDNode *Extract = *UI; 13652 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 13653 return SDValue(); 13654 13655 if (Extract->getValueType(0) != MVT::i32) 13656 return SDValue(); 13657 if (!Extract->hasOneUse()) 13658 return SDValue(); 13659 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 13660 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 13661 return SDValue(); 13662 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 13663 return SDValue(); 13664 13665 // Record which element was extracted. 13666 ExtractedElements |= 13667 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 13668 13669 Uses.push_back(Extract); 13670 } 13671 13672 // If not all the elements were used, this may not be worthwhile. 13673 if (ExtractedElements != 15) 13674 return SDValue(); 13675 13676 // Ok, we've now decided to do the transformation. 13677 DebugLoc dl = InputVector.getDebugLoc(); 13678 13679 // Store the value to a temporary stack slot. 13680 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 13681 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 13682 MachinePointerInfo(), false, false, 0); 13683 13684 // Replace each use (extract) with a load of the appropriate element. 
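  // Each extract of element i becomes a scalar load from the stack slot at
  // StackPtr + i * EltSize; the loads are chained on the store (Ch) above so
  // they are ordered after the spill.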
13685  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
13686       UE = Uses.end(); UI != UE; ++UI) {
13687    SDNode *Extract = *UI;
13688
13689    // Compute the element's address.
13690    SDValue Idx = Extract->getOperand(1);
13691    unsigned EltSize =
13692        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
13693    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
13694    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13695    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
13696
13697    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
13698                                     StackPtr, OffsetVal);
13699
13700    // Load the scalar.
13701    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
13702                                     ScalarAddr, MachinePointerInfo(),
13703                                     false, false, false, 0);
13704
13705    // Replace the extract with the load.
13706    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
13707  }
13708
13709  // The replacement was made in place; don't return anything.
13710  return SDValue();
13711}
13712
13713/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
13714/// nodes.
13715static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
13716                                    TargetLowering::DAGCombinerInfo &DCI,
13717                                    const X86Subtarget *Subtarget) {
13718  DebugLoc DL = N->getDebugLoc();
13719  SDValue Cond = N->getOperand(0);
13720  // Get the LHS/RHS of the select.
13721  SDValue LHS = N->getOperand(1);
13722  SDValue RHS = N->getOperand(2);
13723  EVT VT = LHS.getValueType();
13724
13725  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
13726  // instructions match the semantics of the common C idiom x<y?x:y but not
13727  // x<=y?x:y, because of how they handle negative zero (which can be
13728  // ignored in unsafe-math mode).
13729  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
13730      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
13731      (Subtarget->hasSSE2() ||
13732       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
13733    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
13734
13735    unsigned Opcode = 0;
13736    // Check for x CC y ? x : y.
13737    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
13738        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
13739      switch (CC) {
13740      default: break;
13741      case ISD::SETULT:
13742        // Converting this to a min would handle NaNs incorrectly, and swapping
13743        // the operands would cause it to handle comparisons between positive
13744        // and negative zero incorrectly.
13745        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
13746          if (!DAG.getTarget().Options.UnsafeFPMath &&
13747              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
13748            break;
13749          std::swap(LHS, RHS);
13750        }
13751        Opcode = X86ISD::FMIN;
13752        break;
13753      case ISD::SETOLE:
13754        // Converting this to a min would handle comparisons between positive
13755        // and negative zero incorrectly.
13756        if (!DAG.getTarget().Options.UnsafeFPMath &&
13757            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
13758          break;
13759        Opcode = X86ISD::FMIN;
13760        break;
13761      case ISD::SETULE:
13762        // Converting this to a min would handle both negative zeros and NaNs
13763        // incorrectly, but we can swap the operands to fix both.
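        // Swap the operands and fall through; the ordered cases below then
        // select FMIN, which now handles both problem cases correctly.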
13764 std::swap(LHS, RHS); 13765 case ISD::SETOLT: 13766 case ISD::SETLT: 13767 case ISD::SETLE: 13768 Opcode = X86ISD::FMIN; 13769 break; 13770 13771 case ISD::SETOGE: 13772 // Converting this to a max would handle comparisons between positive 13773 // and negative zero incorrectly. 13774 if (!DAG.getTarget().Options.UnsafeFPMath && 13775 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 13776 break; 13777 Opcode = X86ISD::FMAX; 13778 break; 13779 case ISD::SETUGT: 13780 // Converting this to a max would handle NaNs incorrectly, and swapping 13781 // the operands would cause it to handle comparisons between positive 13782 // and negative zero incorrectly. 13783 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 13784 if (!DAG.getTarget().Options.UnsafeFPMath && 13785 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 13786 break; 13787 std::swap(LHS, RHS); 13788 } 13789 Opcode = X86ISD::FMAX; 13790 break; 13791 case ISD::SETUGE: 13792 // Converting this to a max would handle both negative zeros and NaNs 13793 // incorrectly, but we can swap the operands to fix both. 13794 std::swap(LHS, RHS); 13795 case ISD::SETOGT: 13796 case ISD::SETGT: 13797 case ISD::SETGE: 13798 Opcode = X86ISD::FMAX; 13799 break; 13800 } 13801 // Check for x CC y ? y : x -- a min/max with reversed arms. 13802 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 13803 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 13804 switch (CC) { 13805 default: break; 13806 case ISD::SETOGE: 13807 // Converting this to a min would handle comparisons between positive 13808 // and negative zero incorrectly, and swapping the operands would 13809 // cause it to handle NaNs incorrectly. 13810 if (!DAG.getTarget().Options.UnsafeFPMath && 13811 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 13812 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 13813 break; 13814 std::swap(LHS, RHS); 13815 } 13816 Opcode = X86ISD::FMIN; 13817 break; 13818 case ISD::SETUGT: 13819 // Converting this to a min would handle NaNs incorrectly. 13820 if (!DAG.getTarget().Options.UnsafeFPMath && 13821 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 13822 break; 13823 Opcode = X86ISD::FMIN; 13824 break; 13825 case ISD::SETUGE: 13826 // Converting this to a min would handle both negative zeros and NaNs 13827 // incorrectly, but we can swap the operands to fix both. 13828 std::swap(LHS, RHS); 13829 case ISD::SETOGT: 13830 case ISD::SETGT: 13831 case ISD::SETGE: 13832 Opcode = X86ISD::FMIN; 13833 break; 13834 13835 case ISD::SETULT: 13836 // Converting this to a max would handle NaNs incorrectly. 13837 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 13838 break; 13839 Opcode = X86ISD::FMAX; 13840 break; 13841 case ISD::SETOLE: 13842 // Converting this to a max would handle comparisons between positive 13843 // and negative zero incorrectly, and swapping the operands would 13844 // cause it to handle NaNs incorrectly. 13845 if (!DAG.getTarget().Options.UnsafeFPMath && 13846 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 13847 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 13848 break; 13849 std::swap(LHS, RHS); 13850 } 13851 Opcode = X86ISD::FMAX; 13852 break; 13853 case ISD::SETULE: 13854 // Converting this to a max would handle both negative zeros and NaNs 13855 // incorrectly, but we can swap the operands to fix both. 
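        // As above, swap the operands and fall through so the ordered cases
        // below pick FMAX.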
13856 std::swap(LHS, RHS); 13857 case ISD::SETOLT: 13858 case ISD::SETLT: 13859 case ISD::SETLE: 13860 Opcode = X86ISD::FMAX; 13861 break; 13862 } 13863 } 13864 13865 if (Opcode) 13866 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 13867 } 13868 13869 // If this is a select between two integer constants, try to do some 13870 // optimizations. 13871 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 13872 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 13873 // Don't do this for crazy integer types. 13874 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 13875 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 13876 // so that TrueC (the true value) is larger than FalseC. 13877 bool NeedsCondInvert = false; 13878 13879 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 13880 // Efficiently invertible. 13881 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 13882 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 13883 isa<ConstantSDNode>(Cond.getOperand(1))))) { 13884 NeedsCondInvert = true; 13885 std::swap(TrueC, FalseC); 13886 } 13887 13888 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 13889 if (FalseC->getAPIntValue() == 0 && 13890 TrueC->getAPIntValue().isPowerOf2()) { 13891 if (NeedsCondInvert) // Invert the condition if needed. 13892 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 13893 DAG.getConstant(1, Cond.getValueType())); 13894 13895 // Zero extend the condition if needed. 13896 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 13897 13898 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 13899 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 13900 DAG.getConstant(ShAmt, MVT::i8)); 13901 } 13902 13903 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 13904 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 13905 if (NeedsCondInvert) // Invert the condition if needed. 13906 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 13907 DAG.getConstant(1, Cond.getValueType())); 13908 13909 // Zero extend the condition if needed. 13910 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 13911 FalseC->getValueType(0), Cond); 13912 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 13913 SDValue(FalseC, 0)); 13914 } 13915 13916 // Optimize cases that will turn into an LEA instruction. This requires 13917 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 13918 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 13919 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 13920 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 13921 13922 bool isFastMultiplier = false; 13923 if (Diff < 10) { 13924 switch ((unsigned char)Diff) { 13925 default: break; 13926 case 1: // result = add base, cond 13927 case 2: // result = lea base( , cond*2) 13928 case 3: // result = lea base(cond, cond*2) 13929 case 4: // result = lea base( , cond*4) 13930 case 5: // result = lea base(cond, cond*4) 13931 case 8: // result = lea base( , cond*8) 13932 case 9: // result = lea base(cond, cond*8) 13933 isFastMultiplier = true; 13934 break; 13935 } 13936 } 13937 13938 if (isFastMultiplier) { 13939 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 13940 if (NeedsCondInvert) // Invert the condition if needed. 13941 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 13942 DAG.getConstant(1, Cond.getValueType())); 13943 13944 // Zero extend the condition if needed. 
13945          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
13946                             Cond);
13947          // Scale the condition by the difference.
13948          if (Diff != 1)
13949            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
13950                               DAG.getConstant(Diff, Cond.getValueType()));
13951
13952          // Add the base if non-zero.
13953          if (FalseC->getAPIntValue() != 0)
13954            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
13955                               SDValue(FalseC, 0));
13956          return Cond;
13957        }
13958      }
13959    }
13960  }
13961
13962  // Canonicalize max and min:
13963  // (x > y) ? x : y -> (x >= y) ? x : y
13964  // (x < y) ? x : y -> (x <= y) ? x : y
13965  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
13966  // the need for an extra compare
13967  // against zero. e.g.
13968  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
13969  //   subl   %esi, %edi
13970  //   testl  %edi, %edi
13971  //   movl   $0, %eax
13972  //   cmovgl %edi, %eax
13973  // =>
13974  //   xorl   %eax, %eax
13975  //   subl   %esi, %edi
13976  //   cmovsl %eax, %edi
13977  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
13978      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
13979      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
13980    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
13981    switch (CC) {
13982    default: break;
13983    case ISD::SETLT:
13984    case ISD::SETGT: {
13985      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
13986      Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
13987                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
13988      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
13989    }
13990    }
13991  }
13992
13993  // If we know that this node is legal then we know that it is going to be
13994  // matched by one of the SSE/AVX BLEND instructions. These instructions only
13995  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
13996  // to simplify previous instructions.
13997  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13998  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
13999      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
14000    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
14001
14002    // Don't optimize vector selects that map to mask-registers.
14003    if (BitWidth == 1)
14004      return SDValue();
14005
14006    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
14007    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
14008
14009    APInt KnownZero, KnownOne;
14010    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
14011                                          DCI.isBeforeLegalizeOps());
14012    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
14013        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
14014      DCI.CommitTargetLoweringOpt(TLO);
14015  }
14016
14017  return SDValue();
14018}
14019
14020// Check whether a boolean test is testing a boolean value generated by
14021// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
14022// condition code.
14023//
14024// Simplify the following patterns:
14025// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
14026// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
14027// to (Op EFLAGS Cond)
14028//
14029// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
14030// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
14031// to (Op EFLAGS !Cond)
14032//
14033// where Op could be BRCOND or CMOV.
14034//
14035static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
14036  // Quit unless this is a CMP, or a SUB whose value result is unused.
14037  if (Cmp.getOpcode() != X86ISD::CMP &&
14038      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
14039    return SDValue();
14040
14041  // Quit if not used as a boolean value.
14042  if (CC != X86::COND_E && CC != X86::COND_NE)
14043    return SDValue();
14044
14045  // Check CMP operands. One of them should be 0 or 1 and the other should be
14046  // a SETCC or extended from it.
14047  SDValue Op1 = Cmp.getOperand(0);
14048  SDValue Op2 = Cmp.getOperand(1);
14049
14050  SDValue SetCC;
14051  const ConstantSDNode* C = 0;
14052  bool needOppositeCond = (CC == X86::COND_E);
14053
14054  if ((C = dyn_cast<ConstantSDNode>(Op1)))
14055    SetCC = Op2;
14056  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
14057    SetCC = Op1;
14058  else // Quit if neither operand is a constant.
14059    return SDValue();
14060
14061  if (C->getZExtValue() == 1)
14062    needOppositeCond = !needOppositeCond;
14063  else if (C->getZExtValue() != 0)
14064    // Quit if the constant is neither 0 nor 1.
14065    return SDValue();
14066
14067  // Skip 'zext' node.
14068  if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
14069    SetCC = SetCC.getOperand(0);
14070
14071  // Quit if not SETCC.
14072  // FIXME: So far we only handle the boolean value generated from SETCC. If
14073  // there are other ways to generate boolean values, we need to handle them
14074  // here as well.
14075  if (SetCC.getOpcode() != X86ISD::SETCC)
14076    return SDValue();
14077
14078  // Set the condition code or opposite one if necessary.
14079  CC = X86::CondCode(SetCC.getConstantOperandVal(0));
14080  if (needOppositeCond)
14081    CC = X86::GetOppositeBranchCondition(CC);
14082
14083  return SetCC.getOperand(1);
14084}
14085
14086/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS
14087/// updated. If only the flag result is used and that result is computed from
14088/// a series of element extractions, try to combine it into a PTEST.
14089static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC,
14090                                     SelectionDAG &DAG,
14091                                     const X86Subtarget *Subtarget) {
14092  SDNode *N = Or.getNode();
14093  DebugLoc DL = N->getDebugLoc();
14094
14095  // Only SSE4.1 and beyond support PTEST and the like.
14096  if (!Subtarget->hasSSE41())
14097    return SDValue();
14098
14099  if (N->getOpcode() != X86ISD::OR)
14100    return SDValue();
14101
14102  // Quit if the value result of OR is used.
14103  if (N->hasAnyUseOfValue(0))
14104    return SDValue();
14105
14106  // Quit if not used as a boolean value.
14107  if (CC != X86::COND_E && CC != X86::COND_NE)
14108    return SDValue();
14109
14110  SmallVector<SDValue, 8> Opnds;
14111  SDValue VecIn;
14112  EVT VT = MVT::Other;
14113  unsigned Mask = 0;
14114
14115  // Recognize a special case where a vector is cast into a wide integer to
14116  // test for all 0s.
14117  Opnds.push_back(N->getOperand(0));
14118  Opnds.push_back(N->getOperand(1));
14119
14120  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14121    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
14122    // BFS traverse all OR'd operands.
14123    if (I->getOpcode() == ISD::OR) {
14124      Opnds.push_back(I->getOperand(0));
14125      Opnds.push_back(I->getOperand(1));
14126      // Re-evaluate the number of nodes to be traversed.
14127      e += 2; // 2 more nodes (LHS and RHS) are pushed.
14128      continue;
14129    }
14130
14131    // Quit if this is not an EXTRACT_VECTOR_ELT.
14132    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14133      return SDValue();
14134
14135    // Quit if the index is not a constant.
14136 SDValue Idx = I->getOperand(1); 14137 if (!isa<ConstantSDNode>(Idx)) 14138 return SDValue(); 14139 14140 // Check if all elements are extracted from the same vector. 14141 SDValue ExtractedFromVec = I->getOperand(0); 14142 if (VecIn.getNode() == 0) { 14143 VT = ExtractedFromVec.getValueType(); 14144 // FIXME: only 128-bit vector is supported so far. 14145 if (!VT.is128BitVector()) 14146 return SDValue(); 14147 VecIn = ExtractedFromVec; 14148 } else if (VecIn != ExtractedFromVec) 14149 return SDValue(); 14150 14151 // Record the constant index. 14152 Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); 14153 } 14154 14155 assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far."); 14156 14157 // Quit if not all elements are used. 14158 if (Mask != (1U << VT.getVectorNumElements()) - 1U) 14159 return SDValue(); 14160 14161 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn); 14162} 14163 14164static bool isValidFCMOVCondition(X86::CondCode CC) { 14165 switch (CC) { 14166 default: 14167 return false; 14168 case X86::COND_B: 14169 case X86::COND_BE: 14170 case X86::COND_E: 14171 case X86::COND_P: 14172 case X86::COND_AE: 14173 case X86::COND_A: 14174 case X86::COND_NE: 14175 case X86::COND_NP: 14176 return true; 14177 } 14178} 14179 14180/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 14181static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 14182 TargetLowering::DAGCombinerInfo &DCI, 14183 const X86Subtarget *Subtarget) { 14184 DebugLoc DL = N->getDebugLoc(); 14185 14186 // If the flag operand isn't dead, don't touch this CMOV. 14187 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 14188 return SDValue(); 14189 14190 SDValue FalseOp = N->getOperand(0); 14191 SDValue TrueOp = N->getOperand(1); 14192 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 14193 SDValue Cond = N->getOperand(3); 14194 14195 if (CC == X86::COND_E || CC == X86::COND_NE) { 14196 switch (Cond.getOpcode()) { 14197 default: break; 14198 case X86ISD::BSR: 14199 case X86ISD::BSF: 14200 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 14201 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 14202 return (CC == X86::COND_E) ? FalseOp : TrueOp; 14203 } 14204 } 14205 14206 SDValue Flags; 14207 14208 Flags = checkBoolTestSetCCCombine(Cond, CC); 14209 if (Flags.getNode() && 14210 // Extra check as FCMOV only supports a subset of X86 cond. 14211 (FalseOp.getValueType() != MVT::f80 || isValidFCMOVCondition(CC))) { 14212 SDValue Ops[] = { FalseOp, TrueOp, 14213 DAG.getConstant(CC, MVT::i8), Flags }; 14214 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), 14215 Ops, array_lengthof(Ops)); 14216 } 14217 14218 Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget); 14219 if (Flags.getNode()) { 14220 SDValue Ops[] = { FalseOp, TrueOp, 14221 DAG.getConstant(CC, MVT::i8), Flags }; 14222 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), 14223 Ops, array_lengthof(Ops)); 14224 } 14225 14226 // If this is a select between two integer constants, try to do some 14227 // optimizations. Note that the operands are ordered the opposite of SELECT 14228 // operands. 14229 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 14230 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 14231 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 14232 // larger than FalseC (the false value). 
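      // Swapping the two constants is free for CMOV: we only have to use the
      // opposite condition code.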
14233 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 14234 CC = X86::GetOppositeBranchCondition(CC); 14235 std::swap(TrueC, FalseC); 14236 } 14237 14238 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 14239 // This is efficient for any integer data type (including i8/i16) and 14240 // shift amount. 14241 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 14242 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 14243 DAG.getConstant(CC, MVT::i8), Cond); 14244 14245 // Zero extend the condition if needed. 14246 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 14247 14248 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 14249 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 14250 DAG.getConstant(ShAmt, MVT::i8)); 14251 if (N->getNumValues() == 2) // Dead flag value? 14252 return DCI.CombineTo(N, Cond, SDValue()); 14253 return Cond; 14254 } 14255 14256 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 14257 // for any integer data type, including i8/i16. 14258 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 14259 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 14260 DAG.getConstant(CC, MVT::i8), Cond); 14261 14262 // Zero extend the condition if needed. 14263 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 14264 FalseC->getValueType(0), Cond); 14265 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 14266 SDValue(FalseC, 0)); 14267 14268 if (N->getNumValues() == 2) // Dead flag value? 14269 return DCI.CombineTo(N, Cond, SDValue()); 14270 return Cond; 14271 } 14272 14273 // Optimize cases that will turn into an LEA instruction. This requires 14274 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 14275 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 14276 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 14277 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 14278 14279 bool isFastMultiplier = false; 14280 if (Diff < 10) { 14281 switch ((unsigned char)Diff) { 14282 default: break; 14283 case 1: // result = add base, cond 14284 case 2: // result = lea base( , cond*2) 14285 case 3: // result = lea base(cond, cond*2) 14286 case 4: // result = lea base( , cond*4) 14287 case 5: // result = lea base(cond, cond*4) 14288 case 8: // result = lea base( , cond*8) 14289 case 9: // result = lea base(cond, cond*8) 14290 isFastMultiplier = true; 14291 break; 14292 } 14293 } 14294 14295 if (isFastMultiplier) { 14296 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 14297 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 14298 DAG.getConstant(CC, MVT::i8), Cond); 14299 // Zero extend the condition if needed. 14300 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 14301 Cond); 14302 // Scale the condition by the difference. 14303 if (Diff != 1) 14304 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 14305 DAG.getConstant(Diff, Cond.getValueType())); 14306 14307 // Add the base if non-zero. 14308 if (FalseC->getAPIntValue() != 0) 14309 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 14310 SDValue(FalseC, 0)); 14311 if (N->getNumValues() == 2) // Dead flag value? 14312 return DCI.CombineTo(N, Cond, SDValue()); 14313 return Cond; 14314 } 14315 } 14316 } 14317 } 14318 return SDValue(); 14319} 14320 14321 14322/// PerformMulCombine - Optimize a single multiply with constant into two 14323/// in order to implement it with two cheaper instructions, e.g. 14324/// LEA + SHL, LEA + LEA. 
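/// For instance, x*45 can become (x*9)*5 (two LEAs), and x*40 can become
/// (x<<3)*5 (a shift plus an LEA).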
14325static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 14326 TargetLowering::DAGCombinerInfo &DCI) { 14327 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 14328 return SDValue(); 14329 14330 EVT VT = N->getValueType(0); 14331 if (VT != MVT::i64) 14332 return SDValue(); 14333 14334 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 14335 if (!C) 14336 return SDValue(); 14337 uint64_t MulAmt = C->getZExtValue(); 14338 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 14339 return SDValue(); 14340 14341 uint64_t MulAmt1 = 0; 14342 uint64_t MulAmt2 = 0; 14343 if ((MulAmt % 9) == 0) { 14344 MulAmt1 = 9; 14345 MulAmt2 = MulAmt / 9; 14346 } else if ((MulAmt % 5) == 0) { 14347 MulAmt1 = 5; 14348 MulAmt2 = MulAmt / 5; 14349 } else if ((MulAmt % 3) == 0) { 14350 MulAmt1 = 3; 14351 MulAmt2 = MulAmt / 3; 14352 } 14353 if (MulAmt2 && 14354 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 14355 DebugLoc DL = N->getDebugLoc(); 14356 14357 if (isPowerOf2_64(MulAmt2) && 14358 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 14359 // If second multiplifer is pow2, issue it first. We want the multiply by 14360 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 14361 // is an add. 14362 std::swap(MulAmt1, MulAmt2); 14363 14364 SDValue NewMul; 14365 if (isPowerOf2_64(MulAmt1)) 14366 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 14367 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 14368 else 14369 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 14370 DAG.getConstant(MulAmt1, VT)); 14371 14372 if (isPowerOf2_64(MulAmt2)) 14373 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 14374 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 14375 else 14376 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 14377 DAG.getConstant(MulAmt2, VT)); 14378 14379 // Do not add new nodes to DAG combiner worklist. 14380 DCI.CombineTo(N, NewMul, false); 14381 } 14382 return SDValue(); 14383} 14384 14385static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 14386 SDValue N0 = N->getOperand(0); 14387 SDValue N1 = N->getOperand(1); 14388 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 14389 EVT VT = N0.getValueType(); 14390 14391 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 14392 // since the result of setcc_c is all zero's or all ones. 14393 if (VT.isInteger() && !VT.isVector() && 14394 N1C && N0.getOpcode() == ISD::AND && 14395 N0.getOperand(1).getOpcode() == ISD::Constant) { 14396 SDValue N00 = N0.getOperand(0); 14397 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 14398 ((N00.getOpcode() == ISD::ANY_EXTEND || 14399 N00.getOpcode() == ISD::ZERO_EXTEND) && 14400 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 14401 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 14402 APInt ShAmt = N1C->getAPIntValue(); 14403 Mask = Mask.shl(ShAmt); 14404 if (Mask != 0) 14405 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 14406 N00, DAG.getConstant(Mask, VT)); 14407 } 14408 } 14409 14410 14411 // Hardware support for vector shifts is sparse which makes us scalarize the 14412 // vector operations in many cases. Also, on sandybridge ADD is faster than 14413 // shl. 14414 // (shl V, 1) -> add V,V 14415 if (isSplatVector(N1.getNode())) { 14416 assert(N0.getValueType().isVector() && "Invalid vector shift type"); 14417 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0)); 14418 // We shift all of the values by one. 
In many cases we do not have 14419 // hardware support for this operation. This is better expressed as an ADD 14420 // of two values. 14421 if (N1C && (1 == N1C->getZExtValue())) { 14422 return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0); 14423 } 14424 } 14425 14426 return SDValue(); 14427} 14428 14429/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 14430/// when possible. 14431static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 14432 TargetLowering::DAGCombinerInfo &DCI, 14433 const X86Subtarget *Subtarget) { 14434 EVT VT = N->getValueType(0); 14435 if (N->getOpcode() == ISD::SHL) { 14436 SDValue V = PerformSHLCombine(N, DAG); 14437 if (V.getNode()) return V; 14438 } 14439 14440 // On X86 with SSE2 support, we can transform this to a vector shift if 14441 // all elements are shifted by the same amount. We can't do this in legalize 14442 // because the a constant vector is typically transformed to a constant pool 14443 // so we have no knowledge of the shift amount. 14444 if (!Subtarget->hasSSE2()) 14445 return SDValue(); 14446 14447 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && 14448 (!Subtarget->hasAVX2() || 14449 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) 14450 return SDValue(); 14451 14452 SDValue ShAmtOp = N->getOperand(1); 14453 EVT EltVT = VT.getVectorElementType(); 14454 DebugLoc DL = N->getDebugLoc(); 14455 SDValue BaseShAmt = SDValue(); 14456 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 14457 unsigned NumElts = VT.getVectorNumElements(); 14458 unsigned i = 0; 14459 for (; i != NumElts; ++i) { 14460 SDValue Arg = ShAmtOp.getOperand(i); 14461 if (Arg.getOpcode() == ISD::UNDEF) continue; 14462 BaseShAmt = Arg; 14463 break; 14464 } 14465 // Handle the case where the build_vector is all undef 14466 // FIXME: Should DAG allow this? 14467 if (i == NumElts) 14468 return SDValue(); 14469 14470 for (; i != NumElts; ++i) { 14471 SDValue Arg = ShAmtOp.getOperand(i); 14472 if (Arg.getOpcode() == ISD::UNDEF) continue; 14473 if (Arg != BaseShAmt) { 14474 return SDValue(); 14475 } 14476 } 14477 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 14478 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 14479 SDValue InVec = ShAmtOp.getOperand(0); 14480 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 14481 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 14482 unsigned i = 0; 14483 for (; i != NumElts; ++i) { 14484 SDValue Arg = InVec.getOperand(i); 14485 if (Arg.getOpcode() == ISD::UNDEF) continue; 14486 BaseShAmt = Arg; 14487 break; 14488 } 14489 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 14490 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 14491 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 14492 if (C->getZExtValue() == SplatIdx) 14493 BaseShAmt = InVec.getOperand(1); 14494 } 14495 } 14496 if (BaseShAmt.getNode() == 0) { 14497 // Don't create instructions with illegal types after legalize 14498 // types has run. 14499 if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) && 14500 !DCI.isBeforeLegalize()) 14501 return SDValue(); 14502 14503 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 14504 DAG.getIntPtrConstant(0)); 14505 } 14506 } else 14507 return SDValue(); 14508 14509 // The shift amount is an i32. 
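  // Normalize BaseShAmt to i32 (truncating or extending as needed) before
  // handing it to getTargetVShiftNode below.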
14510 if (EltVT.bitsGT(MVT::i32)) 14511 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 14512 else if (EltVT.bitsLT(MVT::i32)) 14513 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 14514 14515 // The shift amount is identical so we can do a vector shift. 14516 SDValue ValOp = N->getOperand(0); 14517 switch (N->getOpcode()) { 14518 default: 14519 llvm_unreachable("Unknown shift opcode!"); 14520 case ISD::SHL: 14521 switch (VT.getSimpleVT().SimpleTy) { 14522 default: return SDValue(); 14523 case MVT::v2i64: 14524 case MVT::v4i32: 14525 case MVT::v8i16: 14526 case MVT::v4i64: 14527 case MVT::v8i32: 14528 case MVT::v16i16: 14529 return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG); 14530 } 14531 case ISD::SRA: 14532 switch (VT.getSimpleVT().SimpleTy) { 14533 default: return SDValue(); 14534 case MVT::v4i32: 14535 case MVT::v8i16: 14536 case MVT::v8i32: 14537 case MVT::v16i16: 14538 return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG); 14539 } 14540 case ISD::SRL: 14541 switch (VT.getSimpleVT().SimpleTy) { 14542 default: return SDValue(); 14543 case MVT::v2i64: 14544 case MVT::v4i32: 14545 case MVT::v8i16: 14546 case MVT::v4i64: 14547 case MVT::v8i32: 14548 case MVT::v16i16: 14549 return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG); 14550 } 14551 } 14552} 14553 14554 14555// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 14556// where both setccs reference the same FP CMP, and rewrite for CMPEQSS 14557// and friends. Likewise for OR -> CMPNEQSS. 14558static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 14559 TargetLowering::DAGCombinerInfo &DCI, 14560 const X86Subtarget *Subtarget) { 14561 unsigned opcode; 14562 14563 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 14564 // we're requiring SSE2 for both. 14565 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 14566 SDValue N0 = N->getOperand(0); 14567 SDValue N1 = N->getOperand(1); 14568 SDValue CMP0 = N0->getOperand(1); 14569 SDValue CMP1 = N1->getOperand(1); 14570 DebugLoc DL = N->getDebugLoc(); 14571 14572 // The SETCCs should both refer to the same CMP. 14573 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 14574 return SDValue(); 14575 14576 SDValue CMP00 = CMP0->getOperand(0); 14577 SDValue CMP01 = CMP0->getOperand(1); 14578 EVT VT = CMP00.getValueType(); 14579 14580 if (VT == MVT::f32 || VT == MVT::f64) { 14581 bool ExpectingFlags = false; 14582 // Check for any users that want flags: 14583 for (SDNode::use_iterator UI = N->use_begin(), 14584 UE = N->use_end(); 14585 !ExpectingFlags && UI != UE; ++UI) 14586 switch (UI->getOpcode()) { 14587 default: 14588 case ISD::BR_CC: 14589 case ISD::BRCOND: 14590 case ISD::SELECT: 14591 ExpectingFlags = true; 14592 break; 14593 case ISD::CopyToReg: 14594 case ISD::SIGN_EXTEND: 14595 case ISD::ZERO_EXTEND: 14596 case ISD::ANY_EXTEND: 14597 break; 14598 } 14599 14600 if (!ExpectingFlags) { 14601 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 14602 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 14603 14604 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 14605 X86::CondCode tmp = cc0; 14606 cc0 = cc1; 14607 cc1 = tmp; 14608 } 14609 14610 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 14611 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 14612 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 14613 X86ISD::NodeType NTOperator = is64BitFP ? 
14614 X86ISD::FSETCCsd : X86ISD::FSETCCss; 14615 // FIXME: need symbolic constants for these magic numbers. 14616 // See X86ATTInstPrinter.cpp:printSSECC(). 14617 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; 14618 SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, 14619 DAG.getConstant(x86cc, MVT::i8)); 14620 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, 14621 OnesOrZeroesF); 14622 SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, 14623 DAG.getConstant(1, MVT::i32)); 14624 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); 14625 return OneBitOfTruth; 14626 } 14627 } 14628 } 14629 } 14630 return SDValue(); 14631} 14632 14633/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector 14634/// so it can be folded inside ANDNP. 14635static bool CanFoldXORWithAllOnes(const SDNode *N) { 14636 EVT VT = N->getValueType(0); 14637 14638 // Match direct AllOnes for 128 and 256-bit vectors 14639 if (ISD::isBuildVectorAllOnes(N)) 14640 return true; 14641 14642 // Look through a bit convert. 14643 if (N->getOpcode() == ISD::BITCAST) 14644 N = N->getOperand(0).getNode(); 14645 14646 // Sometimes the operand may come from a insert_subvector building a 256-bit 14647 // allones vector 14648 if (VT.is256BitVector() && 14649 N->getOpcode() == ISD::INSERT_SUBVECTOR) { 14650 SDValue V1 = N->getOperand(0); 14651 SDValue V2 = N->getOperand(1); 14652 14653 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && 14654 V1.getOperand(0).getOpcode() == ISD::UNDEF && 14655 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && 14656 ISD::isBuildVectorAllOnes(V2.getNode())) 14657 return true; 14658 } 14659 14660 return false; 14661} 14662 14663static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 14664 TargetLowering::DAGCombinerInfo &DCI, 14665 const X86Subtarget *Subtarget) { 14666 if (DCI.isBeforeLegalizeOps()) 14667 return SDValue(); 14668 14669 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 14670 if (R.getNode()) 14671 return R; 14672 14673 EVT VT = N->getValueType(0); 14674 14675 // Create ANDN, BLSI, and BLSR instructions 14676 // BLSI is X & (-X) 14677 // BLSR is X & (X-1) 14678 if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { 14679 SDValue N0 = N->getOperand(0); 14680 SDValue N1 = N->getOperand(1); 14681 DebugLoc DL = N->getDebugLoc(); 14682 14683 // Check LHS for not 14684 if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1))) 14685 return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1); 14686 // Check RHS for not 14687 if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1))) 14688 return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0); 14689 14690 // Check LHS for neg 14691 if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && 14692 isZero(N0.getOperand(0))) 14693 return DAG.getNode(X86ISD::BLSI, DL, VT, N1); 14694 14695 // Check RHS for neg 14696 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && 14697 isZero(N1.getOperand(0))) 14698 return DAG.getNode(X86ISD::BLSI, DL, VT, N0); 14699 14700 // Check LHS for X-1 14701 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 14702 isAllOnes(N0.getOperand(1))) 14703 return DAG.getNode(X86ISD::BLSR, DL, VT, N1); 14704 14705 // Check RHS for X-1 14706 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 14707 isAllOnes(N1.getOperand(1))) 14708 return DAG.getNode(X86ISD::BLSR, DL, VT, N0); 14709 14710 return SDValue(); 14711 } 14712 14713 // Want to form ANDNP nodes: 14714 // 1) In the hopes 
of then easily combining them with OR and AND nodes 14715 // to form PBLEND/PSIGN. 14716 // 2) To match ANDN packed intrinsics 14717 if (VT != MVT::v2i64 && VT != MVT::v4i64) 14718 return SDValue(); 14719 14720 SDValue N0 = N->getOperand(0); 14721 SDValue N1 = N->getOperand(1); 14722 DebugLoc DL = N->getDebugLoc(); 14723 14724 // Check LHS for vnot 14725 if (N0.getOpcode() == ISD::XOR && 14726 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 14727 CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) 14728 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); 14729 14730 // Check RHS for vnot 14731 if (N1.getOpcode() == ISD::XOR && 14732 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 14733 CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) 14734 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); 14735 14736 return SDValue(); 14737} 14738 14739static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 14740 TargetLowering::DAGCombinerInfo &DCI, 14741 const X86Subtarget *Subtarget) { 14742 if (DCI.isBeforeLegalizeOps()) 14743 return SDValue(); 14744 14745 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 14746 if (R.getNode()) 14747 return R; 14748 14749 EVT VT = N->getValueType(0); 14750 14751 SDValue N0 = N->getOperand(0); 14752 SDValue N1 = N->getOperand(1); 14753 14754 // look for psign/blend 14755 if (VT == MVT::v2i64 || VT == MVT::v4i64) { 14756 if (!Subtarget->hasSSSE3() || 14757 (VT == MVT::v4i64 && !Subtarget->hasAVX2())) 14758 return SDValue(); 14759 14760 // Canonicalize pandn to RHS 14761 if (N0.getOpcode() == X86ISD::ANDNP) 14762 std::swap(N0, N1); 14763 // or (and (m, y), (pandn m, x)) 14764 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { 14765 SDValue Mask = N1.getOperand(0); 14766 SDValue X = N1.getOperand(1); 14767 SDValue Y; 14768 if (N0.getOperand(0) == Mask) 14769 Y = N0.getOperand(1); 14770 if (N0.getOperand(1) == Mask) 14771 Y = N0.getOperand(0); 14772 14773 // Check to see if the mask appeared in both the AND and ANDNP and 14774 if (!Y.getNode()) 14775 return SDValue(); 14776 14777 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 14778 // Look through mask bitcast. 14779 if (Mask.getOpcode() == ISD::BITCAST) 14780 Mask = Mask.getOperand(0); 14781 if (X.getOpcode() == ISD::BITCAST) 14782 X = X.getOperand(0); 14783 if (Y.getOpcode() == ISD::BITCAST) 14784 Y = Y.getOperand(0); 14785 14786 EVT MaskVT = Mask.getValueType(); 14787 14788 // Validate that the Mask operand is a vector sra node. 14789 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 14790 // there is no psrai.b 14791 if (Mask.getOpcode() != X86ISD::VSRAI) 14792 return SDValue(); 14793 14794 // Check that the SRA is all signbits. 14795 SDValue SraC = Mask.getOperand(1); 14796 unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 14797 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 14798 if ((SraAmt + 1) != EltBits) 14799 return SDValue(); 14800 14801 DebugLoc DL = N->getDebugLoc(); 14802 14803 // Now we know we at least have a plendvb with the mask val. See if 14804 // we can form a psignb/w/d. 
14805 // psign = x.type == y.type == mask.type && y = sub(0, x); 14806 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 14807 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 14808 X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { 14809 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && 14810 "Unsupported VT for PSIGN"); 14811 Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); 14812 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 14813 } 14814 // PBLENDVB only available on SSE 4.1 14815 if (!Subtarget->hasSSE41()) 14816 return SDValue(); 14817 14818 EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; 14819 14820 X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); 14821 Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); 14822 Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); 14823 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); 14824 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 14825 } 14826 } 14827 14828 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 14829 return SDValue(); 14830 14831 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 14832 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 14833 std::swap(N0, N1); 14834 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 14835 return SDValue(); 14836 if (!N0.hasOneUse() || !N1.hasOneUse()) 14837 return SDValue(); 14838 14839 SDValue ShAmt0 = N0.getOperand(1); 14840 if (ShAmt0.getValueType() != MVT::i8) 14841 return SDValue(); 14842 SDValue ShAmt1 = N1.getOperand(1); 14843 if (ShAmt1.getValueType() != MVT::i8) 14844 return SDValue(); 14845 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 14846 ShAmt0 = ShAmt0.getOperand(0); 14847 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 14848 ShAmt1 = ShAmt1.getOperand(0); 14849 14850 DebugLoc DL = N->getDebugLoc(); 14851 unsigned Opc = X86ISD::SHLD; 14852 SDValue Op0 = N0.getOperand(0); 14853 SDValue Op1 = N1.getOperand(0); 14854 if (ShAmt0.getOpcode() == ISD::SUB) { 14855 Opc = X86ISD::SHRD; 14856 std::swap(Op0, Op1); 14857 std::swap(ShAmt0, ShAmt1); 14858 } 14859 14860 unsigned Bits = VT.getSizeInBits(); 14861 if (ShAmt1.getOpcode() == ISD::SUB) { 14862 SDValue Sum = ShAmt1.getOperand(0); 14863 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 14864 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 14865 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 14866 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 14867 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 14868 return DAG.getNode(Opc, DL, VT, 14869 Op0, Op1, 14870 DAG.getNode(ISD::TRUNCATE, DL, 14871 MVT::i8, ShAmt0)); 14872 } 14873 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 14874 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 14875 if (ShAmt0C && 14876 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 14877 return DAG.getNode(Opc, DL, VT, 14878 N0.getOperand(0), N1.getOperand(0), 14879 DAG.getNode(ISD::TRUNCATE, DL, 14880 MVT::i8, ShAmt0)); 14881 } 14882 14883 return SDValue(); 14884} 14885 14886// Generate NEG and CMOV for integer abs. 14887static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { 14888 EVT VT = N->getValueType(0); 14889 14890 // Since X86 does not have CMOV for 8-bit integer, we don't convert 14891 // 8-bit integer abs to NEG and CMOV. 
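  // The pattern matched below, XOR(ADD(X, Y), Y) with Y = SRA(X, size-1), is
  // the classic branchless abs(X) idiom (X when X >= 0, -X otherwise); it is
  // rewritten here as SUB + CMOV.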
14892 if (VT.isInteger() && VT.getSizeInBits() == 8) 14893 return SDValue(); 14894 14895 SDValue N0 = N->getOperand(0); 14896 SDValue N1 = N->getOperand(1); 14897 DebugLoc DL = N->getDebugLoc(); 14898 14899 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 14900 // and change it to SUB and CMOV. 14901 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 14902 N0.getOpcode() == ISD::ADD && 14903 N0.getOperand(1) == N1 && 14904 N1.getOpcode() == ISD::SRA && 14905 N1.getOperand(0) == N0.getOperand(0)) 14906 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 14907 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { 14908 // Generate SUB & CMOV. 14909 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), 14910 DAG.getConstant(0, VT), N0.getOperand(0)); 14911 14912 SDValue Ops[] = { N0.getOperand(0), Neg, 14913 DAG.getConstant(X86::COND_GE, MVT::i8), 14914 SDValue(Neg.getNode(), 1) }; 14915 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), 14916 Ops, array_lengthof(Ops)); 14917 } 14918 return SDValue(); 14919} 14920 14921// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes 14922static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, 14923 TargetLowering::DAGCombinerInfo &DCI, 14924 const X86Subtarget *Subtarget) { 14925 if (DCI.isBeforeLegalizeOps()) 14926 return SDValue(); 14927 14928 if (Subtarget->hasCMov()) { 14929 SDValue RV = performIntegerAbsCombine(N, DAG); 14930 if (RV.getNode()) 14931 return RV; 14932 } 14933 14934 // Try forming BMI if it is available. 14935 if (!Subtarget->hasBMI()) 14936 return SDValue(); 14937 14938 EVT VT = N->getValueType(0); 14939 14940 if (VT != MVT::i32 && VT != MVT::i64) 14941 return SDValue(); 14942 14943 assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions"); 14944 14945 // Create BLSMSK instructions by finding X ^ (X-1) 14946 SDValue N0 = N->getOperand(0); 14947 SDValue N1 = N->getOperand(1); 14948 DebugLoc DL = N->getDebugLoc(); 14949 14950 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 14951 isAllOnes(N0.getOperand(1))) 14952 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); 14953 14954 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 14955 isAllOnes(N1.getOperand(1))) 14956 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); 14957 14958 return SDValue(); 14959} 14960 14961/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. 14962static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, 14963 TargetLowering::DAGCombinerInfo &DCI, 14964 const X86Subtarget *Subtarget) { 14965 LoadSDNode *Ld = cast<LoadSDNode>(N); 14966 EVT RegVT = Ld->getValueType(0); 14967 EVT MemVT = Ld->getMemoryVT(); 14968 DebugLoc dl = Ld->getDebugLoc(); 14969 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14970 14971 ISD::LoadExtType Ext = Ld->getExtensionType(); 14972 14973 // If this is a vector EXT Load then attempt to optimize it using a 14974 // shuffle. We need SSE4 for the shuffles. 14975 // TODO: It is possible to support ZExt by zeroing the undef values 14976 // during the shuffle phase or after the shuffle. 
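  // Strategy: pull the value in with as few legal scalar loads as possible,
  // reassemble those pieces into a wide vector whose elements have the memory
  // element type, then shuffle each element into the low part of its widened
  // lane (the high bits are left undef, which is fine for an extending load).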
14977 if (RegVT.isVector() && RegVT.isInteger() && 14978 Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { 14979 assert(MemVT != RegVT && "Cannot extend to the same type"); 14980 assert(MemVT.isVector() && "Must load a vector from memory"); 14981 14982 unsigned NumElems = RegVT.getVectorNumElements(); 14983 unsigned RegSz = RegVT.getSizeInBits(); 14984 unsigned MemSz = MemVT.getSizeInBits(); 14985 assert(RegSz > MemSz && "Register size must be greater than the mem size"); 14986 14987 // All sizes must be a power of two. 14988 if (!isPowerOf2_32(RegSz * MemSz * NumElems)) 14989 return SDValue(); 14990 14991 // Attempt to load the original value using scalar loads. 14992 // Find the largest scalar type that divides the total loaded size. 14993 MVT SclrLoadTy = MVT::i8; 14994 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 14995 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 14996 MVT Tp = (MVT::SimpleValueType)tp; 14997 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { 14998 SclrLoadTy = Tp; 14999 } 15000 } 15001 15002 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 15003 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && 15004 (64 <= MemSz)) 15005 SclrLoadTy = MVT::f64; 15006 15007 // Calculate the number of scalar loads that we need to perform 15008 // in order to load our vector from memory. 15009 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); 15010 15011 // Represent our vector as a sequence of elements which are the 15012 // largest scalar that we can load. 15013 EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, 15014 RegSz/SclrLoadTy.getSizeInBits()); 15015 15016 // Represent the data using the same element type that is stored in 15017 // memory. In practice, we ''widen'' MemVT. 15018 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 15019 RegSz/MemVT.getScalarType().getSizeInBits()); 15020 15021 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && 15022 "Invalid vector type"); 15023 15024 // We can't shuffle using an illegal type. 15025 if (!TLI.isTypeLegal(WideVecVT)) 15026 return SDValue(); 15027 15028 SmallVector<SDValue, 8> Chains; 15029 SDValue Ptr = Ld->getBasePtr(); 15030 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8, 15031 TLI.getPointerTy()); 15032 SDValue Res = DAG.getUNDEF(LoadUnitVecVT); 15033 15034 for (unsigned i = 0; i < NumLoads; ++i) { 15035 // Perform a single load. 15036 SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), 15037 Ptr, Ld->getPointerInfo(), 15038 Ld->isVolatile(), Ld->isNonTemporal(), 15039 Ld->isInvariant(), Ld->getAlignment()); 15040 Chains.push_back(ScalarLoad.getValue(1)); 15041 // Create the first element type using SCALAR_TO_VECTOR in order to avoid 15042 // another round of DAGCombining. 15043 if (i == 0) 15044 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); 15045 else 15046 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, 15047 ScalarLoad, DAG.getIntPtrConstant(i)); 15048 15049 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 15050 } 15051 15052 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 15053 Chains.size()); 15054 15055 // Bitcast the loaded value to a vector of the original element type, in 15056 // the size of the target vector type. 
15057 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); 15058 unsigned SizeRatio = RegSz/MemSz; 15059 15060 // Redistribute the loaded elements into the different locations. 15061 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 15062 for (unsigned i = 0; i != NumElems; ++i) 15063 ShuffleVec[i*SizeRatio] = i; 15064 15065 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 15066 DAG.getUNDEF(WideVecVT), 15067 &ShuffleVec[0]); 15068 15069 // Bitcast to the requested type. 15070 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 15071 // Replace the original load with the new sequence 15072 // and return the new chain. 15073 return DCI.CombineTo(N, Shuff, TF, true); 15074 } 15075 15076 return SDValue(); 15077} 15078 15079/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 15080static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 15081 const X86Subtarget *Subtarget) { 15082 StoreSDNode *St = cast<StoreSDNode>(N); 15083 EVT VT = St->getValue().getValueType(); 15084 EVT StVT = St->getMemoryVT(); 15085 DebugLoc dl = St->getDebugLoc(); 15086 SDValue StoredVal = St->getOperand(1); 15087 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15088 15089 // If we are saving a concatenation of two XMM registers, perform two stores. 15090 // On Sandy Bridge, 256-bit memory operations are executed by two 15091 // 128-bit ports. However, on Haswell it is better to issue a single 256-bit 15092 // memory operation. 15093 if (VT.is256BitVector() && !Subtarget->hasAVX2() && 15094 StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && 15095 StoredVal.getNumOperands() == 2) { 15096 SDValue Value0 = StoredVal.getOperand(0); 15097 SDValue Value1 = StoredVal.getOperand(1); 15098 15099 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); 15100 SDValue Ptr0 = St->getBasePtr(); 15101 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); 15102 15103 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 15104 St->getPointerInfo(), St->isVolatile(), 15105 St->isNonTemporal(), St->getAlignment()); 15106 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 15107 St->getPointerInfo(), St->isVolatile(), 15108 St->isNonTemporal(), St->getAlignment()); 15109 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 15110 } 15111 15112 // Optimize trunc store (of multiple scalars) to shuffle and store. 15113 // First, pack all of the elements in one place. Next, store to memory 15114 // in fewer chunks. 15115 if (St->isTruncatingStore() && VT.isVector()) { 15116 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15117 unsigned NumElems = VT.getVectorNumElements(); 15118 assert(StVT != VT && "Cannot truncate to the same type"); 15119 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 15120 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 15121 15122 // From, To sizes and ElemCount must be pow of two 15123 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 15124 // We are going to use the original vector elt for storing. 15125 // Accumulated smaller vector elements must be a multiple of the store size. 
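// For example, a truncating store of v8i32 to v8i16 has FromSz = 32,
// ToSz = 16 and SizeRatio = 2. The value is bitcast to v16i16 and shuffled
// with mask <0,2,4,6,8,10,12,14,-1,...> so the eight truncated halves end up
// in the low lanes; the low 128 bits are then written out with a small
// number of wide scalar stores (two i64 stores on a 64-bit target).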
15126 if (0 != (NumElems * FromSz) % ToSz) return SDValue(); 15127 15128 unsigned SizeRatio = FromSz / ToSz; 15129 15130 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 15131 15132 // Create a type on which we perform the shuffle 15133 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 15134 StVT.getScalarType(), NumElems*SizeRatio); 15135 15136 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 15137 15138 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); 15139 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 15140 for (unsigned i = 0; i != NumElems; ++i) 15141 ShuffleVec[i] = i * SizeRatio; 15142 15143 // Can't shuffle using an illegal type. 15144 if (!TLI.isTypeLegal(WideVecVT)) 15145 return SDValue(); 15146 15147 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 15148 DAG.getUNDEF(WideVecVT), 15149 &ShuffleVec[0]); 15150 // At this point all of the data is stored at the bottom of the 15151 // register. We now need to save it to mem. 15152 15153 // Find the largest store unit 15154 MVT StoreType = MVT::i8; 15155 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 15156 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 15157 MVT Tp = (MVT::SimpleValueType)tp; 15158 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) 15159 StoreType = Tp; 15160 } 15161 15162 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 15163 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && 15164 (64 <= NumElems * ToSz)) 15165 StoreType = MVT::f64; 15166 15167 // Bitcast the original vector into a vector of store-size units 15168 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 15169 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); 15170 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 15171 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); 15172 SmallVector<SDValue, 8> Chains; 15173 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, 15174 TLI.getPointerTy()); 15175 SDValue Ptr = St->getBasePtr(); 15176 15177 // Perform one or more big stores into memory. 15178 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { 15179 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 15180 StoreType, ShuffWide, 15181 DAG.getIntPtrConstant(i)); 15182 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 15183 St->getPointerInfo(), St->isVolatile(), 15184 St->isNonTemporal(), St->getAlignment()); 15185 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 15186 Chains.push_back(Ch); 15187 } 15188 15189 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 15190 Chains.size()); 15191 } 15192 15193 15194 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 15195 // the FP state in cases where an emms may be missing. 15196 // A preferable solution to the general problem is to figure out the right 15197 // places to insert EMMS. This qualifies as a quick hack. 15198 15199 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 
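// The code below matches a 64-bit store whose stored value is itself a
// 64-bit load (possibly separated from the store by a TokenFactor on the
// chain). MMX vector values, and i64 values on 32-bit targets where f64 is
// usable, are rewritten as a single i64 or f64 load/store pair; MMX values
// without that option fall back to two i32 load/store pairs.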
15200 if (VT.getSizeInBits() != 64) 15201 return SDValue(); 15202 15203 const Function *F = DAG.getMachineFunction().getFunction(); 15204 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 15205 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps 15206 && Subtarget->hasSSE2(); 15207 if ((VT.isVector() || 15208 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 15209 isa<LoadSDNode>(St->getValue()) && 15210 !cast<LoadSDNode>(St->getValue())->isVolatile() && 15211 St->getChain().hasOneUse() && !St->isVolatile()) { 15212 SDNode* LdVal = St->getValue().getNode(); 15213 LoadSDNode *Ld = 0; 15214 int TokenFactorIndex = -1; 15215 SmallVector<SDValue, 8> Ops; 15216 SDNode* ChainVal = St->getChain().getNode(); 15217 // Must be a store of a load. We currently handle two cases: the load 15218 // is a direct child, and it's under an intervening TokenFactor. It is 15219 // possible to dig deeper under nested TokenFactors. 15220 if (ChainVal == LdVal) 15221 Ld = cast<LoadSDNode>(St->getChain()); 15222 else if (St->getValue().hasOneUse() && 15223 ChainVal->getOpcode() == ISD::TokenFactor) { 15224 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { 15225 if (ChainVal->getOperand(i).getNode() == LdVal) { 15226 TokenFactorIndex = i; 15227 Ld = cast<LoadSDNode>(St->getValue()); 15228 } else 15229 Ops.push_back(ChainVal->getOperand(i)); 15230 } 15231 } 15232 15233 if (!Ld || !ISD::isNormalLoad(Ld)) 15234 return SDValue(); 15235 15236 // If this is not the MMX case, i.e. we are just turning i64 load/store 15237 // into f64 load/store, avoid the transformation if there are multiple 15238 // uses of the loaded value. 15239 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 15240 return SDValue(); 15241 15242 DebugLoc LdDL = Ld->getDebugLoc(); 15243 DebugLoc StDL = N->getDebugLoc(); 15244 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 15245 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 15246 // pair instead. 15247 if (Subtarget->is64Bit() || F64IsLegal) { 15248 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 15249 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 15250 Ld->getPointerInfo(), Ld->isVolatile(), 15251 Ld->isNonTemporal(), Ld->isInvariant(), 15252 Ld->getAlignment()); 15253 SDValue NewChain = NewLd.getValue(1); 15254 if (TokenFactorIndex != -1) { 15255 Ops.push_back(NewChain); 15256 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 15257 Ops.size()); 15258 } 15259 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 15260 St->getPointerInfo(), 15261 St->isVolatile(), St->isNonTemporal(), 15262 St->getAlignment()); 15263 } 15264 15265 // Otherwise, lower to two pairs of 32-bit loads / stores. 
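// The high 32-bit half is accessed at byte offset 4, so it can only be
// assumed to have MinAlign(original alignment, 4) alignment.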
15266 SDValue LoAddr = Ld->getBasePtr(); 15267 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 15268 DAG.getConstant(4, MVT::i32)); 15269 15270 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 15271 Ld->getPointerInfo(), 15272 Ld->isVolatile(), Ld->isNonTemporal(), 15273 Ld->isInvariant(), Ld->getAlignment()); 15274 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 15275 Ld->getPointerInfo().getWithOffset(4), 15276 Ld->isVolatile(), Ld->isNonTemporal(), 15277 Ld->isInvariant(), 15278 MinAlign(Ld->getAlignment(), 4)); 15279 15280 SDValue NewChain = LoLd.getValue(1); 15281 if (TokenFactorIndex != -1) { 15282 Ops.push_back(LoLd); 15283 Ops.push_back(HiLd); 15284 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 15285 Ops.size()); 15286 } 15287 15288 LoAddr = St->getBasePtr(); 15289 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 15290 DAG.getConstant(4, MVT::i32)); 15291 15292 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 15293 St->getPointerInfo(), 15294 St->isVolatile(), St->isNonTemporal(), 15295 St->getAlignment()); 15296 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 15297 St->getPointerInfo().getWithOffset(4), 15298 St->isVolatile(), 15299 St->isNonTemporal(), 15300 MinAlign(St->getAlignment(), 4)); 15301 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 15302 } 15303 return SDValue(); 15304} 15305 15306/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" 15307/// and return the operands for the horizontal operation in LHS and RHS. A 15308/// horizontal operation performs the binary operation on successive elements 15309/// of its first operand, then on successive elements of its second operand, 15310/// returning the resulting values in a vector. For example, if 15311/// A = < float a0, float a1, float a2, float a3 > 15312/// and 15313/// B = < float b0, float b1, float b2, float b3 > 15314/// then the result of doing a horizontal operation on A and B is 15315/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 15316/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form 15317/// A horizontal-op B, for some already available A and B, and if so then LHS is 15318/// set to A, RHS to B, and the routine returns 'true'. 15319/// Note that the binary operation should have the property that if one of the 15320/// operands is UNDEF then the result is UNDEF. 15321static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { 15322 // Look for the following pattern: if 15323 // A = < float a0, float a1, float a2, float a3 > 15324 // B = < float b0, float b1, float b2, float b3 > 15325 // and 15326 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> 15327 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> 15328 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > 15329 // which is A horizontal-op B. 15330 15331 // At least one of the operands should be a vector shuffle. 15332 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && 15333 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) 15334 return false; 15335 15336 EVT VT = LHS.getValueType(); 15337 15338 assert((VT.is128BitVector() || VT.is256BitVector()) && 15339 "Unsupported vector type for horizontal add/sub"); 15340 15341 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to 15342 // operate independently on 128-bit lanes. 
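// For a 256-bit type such as v8f32 (NumLaneElts = 4, HalfLaneElts = 2) the
// masks accepted below are LHS = <0,2,8,10, 4,6,12,14> and
// RHS = <1,3,9,11, 5,7,13,15>: even/odd pairs are taken first from A and
// then from B within each 128-bit lane, mirroring how VHADDPS/VHADDPD treat
// the two lanes independently.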
15343 unsigned NumElts = VT.getVectorNumElements(); 15344 unsigned NumLanes = VT.getSizeInBits()/128; 15345 unsigned NumLaneElts = NumElts / NumLanes; 15346 assert((NumLaneElts % 2 == 0) && 15347 "Vector type should have an even number of elements in each lane"); 15348 unsigned HalfLaneElts = NumLaneElts/2; 15349 15350 // View LHS in the form 15351 // LHS = VECTOR_SHUFFLE A, B, LMask 15352 // If LHS is not a shuffle then pretend it is the shuffle 15353 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> 15354 // NOTE: in what follows a default initialized SDValue represents an UNDEF of 15355 // type VT. 15356 SDValue A, B; 15357 SmallVector<int, 16> LMask(NumElts); 15358 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 15359 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) 15360 A = LHS.getOperand(0); 15361 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) 15362 B = LHS.getOperand(1); 15363 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); 15364 std::copy(Mask.begin(), Mask.end(), LMask.begin()); 15365 } else { 15366 if (LHS.getOpcode() != ISD::UNDEF) 15367 A = LHS; 15368 for (unsigned i = 0; i != NumElts; ++i) 15369 LMask[i] = i; 15370 } 15371 15372 // Likewise, view RHS in the form 15373 // RHS = VECTOR_SHUFFLE C, D, RMask 15374 SDValue C, D; 15375 SmallVector<int, 16> RMask(NumElts); 15376 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 15377 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) 15378 C = RHS.getOperand(0); 15379 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) 15380 D = RHS.getOperand(1); 15381 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); 15382 std::copy(Mask.begin(), Mask.end(), RMask.begin()); 15383 } else { 15384 if (RHS.getOpcode() != ISD::UNDEF) 15385 C = RHS; 15386 for (unsigned i = 0; i != NumElts; ++i) 15387 RMask[i] = i; 15388 } 15389 15390 // Check that the shuffles are both shuffling the same vectors. 15391 if (!(A == C && B == D) && !(A == D && B == C)) 15392 return false; 15393 15394 // If everything is UNDEF then bail out: it would be better to fold to UNDEF. 15395 if (!A.getNode() && !B.getNode()) 15396 return false; 15397 15398 // If A and B occur in reverse order in RHS, then "swap" them (which means 15399 // rewriting the mask). 15400 if (A != C) 15401 CommuteVectorShuffleMask(RMask, NumElts); 15402 15403 // At this point LHS and RHS are equivalent to 15404 // LHS = VECTOR_SHUFFLE A, B, LMask 15405 // RHS = VECTOR_SHUFFLE A, B, RMask 15406 // Check that the masks correspond to performing a horizontal operation. 15407 for (unsigned i = 0; i != NumElts; ++i) { 15408 int LIdx = LMask[i], RIdx = RMask[i]; 15409 15410 // Ignore any UNDEF components. 15411 if (LIdx < 0 || RIdx < 0 || 15412 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || 15413 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) 15414 continue; 15415 15416 // Check that successive elements are being operated on. If not, this is 15417 // not a horizontal operation. 15418 unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs 15419 unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; 15420 int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; 15421 if (!(LIdx == Index && RIdx == Index + 1) && 15422 !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) 15423 return false; 15424 } 15425 15426 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 15427 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 
15428  return true;
15429}
15430
15431/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
15432static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
15433                                  const X86Subtarget *Subtarget) {
15434  EVT VT = N->getValueType(0);
15435  SDValue LHS = N->getOperand(0);
15436  SDValue RHS = N->getOperand(1);
15437
15438  // Try to synthesize horizontal adds from adds of shuffles.
15439  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
15440       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
15441      isHorizontalBinOp(LHS, RHS, true))
15442    return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
15443  return SDValue();
15444}
15445
15446/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
15447static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
15448                                  const X86Subtarget *Subtarget) {
15449  EVT VT = N->getValueType(0);
15450  SDValue LHS = N->getOperand(0);
15451  SDValue RHS = N->getOperand(1);
15452
15453  // Try to synthesize horizontal subs from subs of shuffles.
15454  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
15455       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
15456      isHorizontalBinOp(LHS, RHS, false))
15457    return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
15458  return SDValue();
15459}
15460
15461/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
15462/// X86ISD::FXOR nodes.
15463static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
15464  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
15465  // F[X]OR(0.0, x) -> x
15466  // F[X]OR(x, 0.0) -> x
15467  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
15468    if (C->getValueAPF().isPosZero())
15469      return N->getOperand(1);
15470  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
15471    if (C->getValueAPF().isPosZero())
15472      return N->getOperand(0);
15473  return SDValue();
15474}
15475
15476/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
15477/// X86ISD::FMAX nodes.
15478static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
15479  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
15480
15481  // Only perform optimizations if UnsafeMath is used.
15482  if (!DAG.getTarget().Options.UnsafeFPMath)
15483    return SDValue();
15484
15485  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
15486  // into FMINC and FMAXC, which are commutative operations.
15487  unsigned NewOp = 0;
15488  switch (N->getOpcode()) {
15489    default: llvm_unreachable("unknown opcode");
15490    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
15491    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
15492  }
15493
15494  return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
15495                     N->getOperand(0), N->getOperand(1));
15496}
15497
15498
15499/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
15500static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 15501 // FAND(0.0, x) -> 0.0 15502 // FAND(x, 0.0) -> 0.0 15503 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 15504 if (C->getValueAPF().isPosZero()) 15505 return N->getOperand(0); 15506 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 15507 if (C->getValueAPF().isPosZero()) 15508 return N->getOperand(1); 15509 return SDValue(); 15510} 15511 15512static SDValue PerformBTCombine(SDNode *N, 15513 SelectionDAG &DAG, 15514 TargetLowering::DAGCombinerInfo &DCI) { 15515 // BT ignores high bits in the bit index operand. 15516 SDValue Op1 = N->getOperand(1); 15517 if (Op1.hasOneUse()) { 15518 unsigned BitWidth = Op1.getValueSizeInBits(); 15519 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 15520 APInt KnownZero, KnownOne; 15521 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 15522 !DCI.isBeforeLegalizeOps()); 15523 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15524 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 15525 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 15526 DCI.CommitTargetLoweringOpt(TLO); 15527 } 15528 return SDValue(); 15529} 15530 15531static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 15532 SDValue Op = N->getOperand(0); 15533 if (Op.getOpcode() == ISD::BITCAST) 15534 Op = Op.getOperand(0); 15535 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 15536 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 15537 VT.getVectorElementType().getSizeInBits() == 15538 OpVT.getVectorElementType().getSizeInBits()) { 15539 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 15540 } 15541 return SDValue(); 15542} 15543 15544static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, 15545 TargetLowering::DAGCombinerInfo &DCI, 15546 const X86Subtarget *Subtarget) { 15547 if (!DCI.isBeforeLegalizeOps()) 15548 return SDValue(); 15549 15550 if (!Subtarget->hasAVX()) 15551 return SDValue(); 15552 15553 EVT VT = N->getValueType(0); 15554 SDValue Op = N->getOperand(0); 15555 EVT OpVT = Op.getValueType(); 15556 DebugLoc dl = N->getDebugLoc(); 15557 15558 if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) || 15559 (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { 15560 15561 if (Subtarget->hasAVX2()) 15562 return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op); 15563 15564 // Optimize vectors in AVX mode 15565 // Sign extend v8i16 to v8i32 and 15566 // v4i32 to v4i64 15567 // 15568 // Divide input vector into two parts 15569 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} 15570 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 15571 // concat the vectors to original VT 15572 15573 unsigned NumElems = OpVT.getVectorNumElements(); 15574 SDValue Undef = DAG.getUNDEF(OpVT); 15575 15576 SmallVector<int,8> ShufMask1(NumElems, -1); 15577 for (unsigned i = 0; i != NumElems/2; ++i) 15578 ShufMask1[i] = i; 15579 15580 SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]); 15581 15582 SmallVector<int,8> ShufMask2(NumElems, -1); 15583 for (unsigned i = 0; i != NumElems/2; ++i) 15584 ShufMask2[i] = i + NumElems/2; 15585 15586 SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]); 15587 15588 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), 15589 VT.getVectorNumElements()/2); 15590 15591 OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); 15592 OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); 
15593 15594 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 15595 } 15596 return SDValue(); 15597} 15598 15599static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, 15600 const X86Subtarget* Subtarget) { 15601 DebugLoc dl = N->getDebugLoc(); 15602 EVT VT = N->getValueType(0); 15603 15604 EVT ScalarVT = VT.getScalarType(); 15605 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA()) 15606 return SDValue(); 15607 15608 SDValue A = N->getOperand(0); 15609 SDValue B = N->getOperand(1); 15610 SDValue C = N->getOperand(2); 15611 15612 bool NegA = (A.getOpcode() == ISD::FNEG); 15613 bool NegB = (B.getOpcode() == ISD::FNEG); 15614 bool NegC = (C.getOpcode() == ISD::FNEG); 15615 15616 // Negative multiplication when NegA xor NegB 15617 bool NegMul = (NegA != NegB); 15618 if (NegA) 15619 A = A.getOperand(0); 15620 if (NegB) 15621 B = B.getOperand(0); 15622 if (NegC) 15623 C = C.getOperand(0); 15624 15625 unsigned Opcode; 15626 if (!NegMul) 15627 Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB; 15628 else 15629 Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB; 15630 return DAG.getNode(Opcode, dl, VT, A, B, C); 15631} 15632 15633static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, 15634 TargetLowering::DAGCombinerInfo &DCI, 15635 const X86Subtarget *Subtarget) { 15636 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 15637 // (and (i32 x86isd::setcc_carry), 1) 15638 // This eliminates the zext. This transformation is necessary because 15639 // ISD::SETCC is always legalized to i8. 15640 DebugLoc dl = N->getDebugLoc(); 15641 SDValue N0 = N->getOperand(0); 15642 EVT VT = N->getValueType(0); 15643 EVT OpVT = N0.getValueType(); 15644 15645 if (N0.getOpcode() == ISD::AND && 15646 N0.hasOneUse() && 15647 N0.getOperand(0).hasOneUse()) { 15648 SDValue N00 = N0.getOperand(0); 15649 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 15650 return SDValue(); 15651 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 15652 if (!C || C->getZExtValue() != 1) 15653 return SDValue(); 15654 return DAG.getNode(ISD::AND, dl, VT, 15655 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 15656 N00.getOperand(0), N00.getOperand(1)), 15657 DAG.getConstant(1, VT)); 15658 } 15659 15660 // Optimize vectors in AVX mode: 15661 // 15662 // v8i16 -> v8i32 15663 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. 15664 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. 15665 // Concat upper and lower parts. 15666 // 15667 // v4i32 -> v4i64 15668 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. 15669 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. 15670 // Concat upper and lower parts. 
15671 // 15672 if (!DCI.isBeforeLegalizeOps()) 15673 return SDValue(); 15674 15675 if (!Subtarget->hasAVX()) 15676 return SDValue(); 15677 15678 if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || 15679 ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) { 15680 15681 if (Subtarget->hasAVX2()) 15682 return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0); 15683 15684 SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); 15685 SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec); 15686 SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec); 15687 15688 EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 15689 VT.getVectorNumElements()/2); 15690 15691 OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); 15692 OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); 15693 15694 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 15695 } 15696 15697 return SDValue(); 15698} 15699 15700// Optimize x == -y --> x+y == 0 15701// x != -y --> x+y != 0 15702static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { 15703 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 15704 SDValue LHS = N->getOperand(0); 15705 SDValue RHS = N->getOperand(1); 15706 15707 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) 15708 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) 15709 if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { 15710 SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), 15711 LHS.getValueType(), RHS, LHS.getOperand(1)); 15712 return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), 15713 addV, DAG.getConstant(0, addV.getValueType()), CC); 15714 } 15715 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) 15716 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) 15717 if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { 15718 SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), 15719 RHS.getValueType(), LHS, RHS.getOperand(1)); 15720 return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), 15721 addV, DAG.getConstant(0, addV.getValueType()), CC); 15722 } 15723 return SDValue(); 15724} 15725 15726// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 15727static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, 15728 TargetLowering::DAGCombinerInfo &DCI, 15729 const X86Subtarget *Subtarget) { 15730 DebugLoc DL = N->getDebugLoc(); 15731 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); 15732 SDValue EFLAGS = N->getOperand(1); 15733 15734 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 15735 // a zext and produces an all-ones bit which is more useful than 0/1 in some 15736 // cases. 15737 if (CC == X86::COND_B) 15738 return DAG.getNode(ISD::AND, DL, MVT::i8, 15739 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 15740 DAG.getConstant(CC, MVT::i8), EFLAGS), 15741 DAG.getConstant(1, MVT::i8)); 15742 15743 SDValue Flags; 15744 15745 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 15746 if (Flags.getNode()) { 15747 SDValue Cond = DAG.getConstant(CC, MVT::i8); 15748 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); 15749 } 15750 15751 Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget); 15752 if (Flags.getNode()) { 15753 SDValue Cond = DAG.getConstant(CC, MVT::i8); 15754 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); 15755 } 15756 15757 return SDValue(); 15758} 15759 15760// Optimize branch condition evaluation. 
15761// 15762static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, 15763 TargetLowering::DAGCombinerInfo &DCI, 15764 const X86Subtarget *Subtarget) { 15765 DebugLoc DL = N->getDebugLoc(); 15766 SDValue Chain = N->getOperand(0); 15767 SDValue Dest = N->getOperand(1); 15768 SDValue EFLAGS = N->getOperand(3); 15769 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); 15770 15771 SDValue Flags; 15772 15773 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 15774 if (Flags.getNode()) { 15775 SDValue Cond = DAG.getConstant(CC, MVT::i8); 15776 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, 15777 Flags); 15778 } 15779 15780 Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget); 15781 if (Flags.getNode()) { 15782 SDValue Cond = DAG.getConstant(CC, MVT::i8); 15783 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, 15784 Flags); 15785 } 15786 15787 return SDValue(); 15788} 15789 15790static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) { 15791 SDValue Op0 = N->getOperand(0); 15792 EVT InVT = Op0->getValueType(0); 15793 15794 // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32)) 15795 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { 15796 DebugLoc dl = N->getDebugLoc(); 15797 MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; 15798 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); 15799 // Notice that we use SINT_TO_FP because we know that the high bits 15800 // are zero and SINT_TO_FP is better supported by the hardware. 15801 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); 15802 } 15803 15804 return SDValue(); 15805} 15806 15807static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 15808 const X86TargetLowering *XTLI) { 15809 SDValue Op0 = N->getOperand(0); 15810 EVT InVT = Op0->getValueType(0); 15811 15812 // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) 15813 if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { 15814 DebugLoc dl = N->getDebugLoc(); 15815 MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; 15816 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); 15817 return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); 15818 } 15819 15820 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 15821 // a 32-bit target where SSE doesn't support i64->FP operations. 15822 if (Op0.getOpcode() == ISD::LOAD) { 15823 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 15824 EVT VT = Ld->getValueType(0); 15825 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 15826 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 15827 !XTLI->getSubtarget()->is64Bit() && 15828 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 15829 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 15830 Ld->getChain(), Op0, DAG); 15831 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 15832 return FILDChain; 15833 } 15834 } 15835 return SDValue(); 15836} 15837 15838static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) { 15839 EVT VT = N->getValueType(0); 15840 15841 // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT() 15842 if (VT == MVT::v8i8 || VT == MVT::v4i8) { 15843 DebugLoc dl = N->getDebugLoc(); 15844 MVT DstVT = VT == MVT::v4i8 ? 
MVT::v4i32 : MVT::v8i32; 15845 SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0)); 15846 return DAG.getNode(ISD::TRUNCATE, dl, VT, I); 15847 } 15848 15849 return SDValue(); 15850} 15851 15852// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 15853static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 15854 X86TargetLowering::DAGCombinerInfo &DCI) { 15855 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 15856 // the result is either zero or one (depending on the input carry bit). 15857 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 15858 if (X86::isZeroNode(N->getOperand(0)) && 15859 X86::isZeroNode(N->getOperand(1)) && 15860 // We don't have a good way to replace an EFLAGS use, so only do this when 15861 // dead right now. 15862 SDValue(N, 1).use_empty()) { 15863 DebugLoc DL = N->getDebugLoc(); 15864 EVT VT = N->getValueType(0); 15865 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 15866 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 15867 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 15868 DAG.getConstant(X86::COND_B,MVT::i8), 15869 N->getOperand(2)), 15870 DAG.getConstant(1, VT)); 15871 return DCI.CombineTo(N, Res1, CarryOut); 15872 } 15873 15874 return SDValue(); 15875} 15876 15877// fold (add Y, (sete X, 0)) -> adc 0, Y 15878// (add Y, (setne X, 0)) -> sbb -1, Y 15879// (sub (sete X, 0), Y) -> sbb 0, Y 15880// (sub (setne X, 0), Y) -> adc -1, Y 15881static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 15882 DebugLoc DL = N->getDebugLoc(); 15883 15884 // Look through ZExts. 15885 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 15886 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 15887 return SDValue(); 15888 15889 SDValue SetCC = Ext.getOperand(0); 15890 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 15891 return SDValue(); 15892 15893 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 15894 if (CC != X86::COND_E && CC != X86::COND_NE) 15895 return SDValue(); 15896 15897 SDValue Cmp = SetCC.getOperand(1); 15898 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 15899 !X86::isZeroNode(Cmp.getOperand(1)) || 15900 !Cmp.getOperand(0).getValueType().isInteger()) 15901 return SDValue(); 15902 15903 SDValue CmpOp0 = Cmp.getOperand(0); 15904 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 15905 DAG.getConstant(1, CmpOp0.getValueType())); 15906 15907 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 15908 if (CC == X86::COND_NE) 15909 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 15910 DL, OtherVal.getValueType(), OtherVal, 15911 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 15912 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, 15913 DL, OtherVal.getValueType(), OtherVal, 15914 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 15915} 15916 15917/// PerformADDCombine - Do target-specific dag combines on integer adds. 15918static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, 15919 const X86Subtarget *Subtarget) { 15920 EVT VT = N->getValueType(0); 15921 SDValue Op0 = N->getOperand(0); 15922 SDValue Op1 = N->getOperand(1); 15923 15924 // Try to synthesize horizontal adds from adds of shuffles. 
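// Integer horizontal adds are PHADDW/PHADDD, which require SSSE3 for the
// 128-bit types and AVX2 for the 256-bit types, hence the feature checks
// below. If no horizontal op can be formed, fall through to the adc/sbb-based
// conditional increment/decrement optimization.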
15925  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
15926       (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
15927      isHorizontalBinOp(Op0, Op1, true))
15928    return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
15929
15930  return OptimizeConditionalInDecrement(N, DAG);
15931}
15932
15933static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
15934                                 const X86Subtarget *Subtarget) {
15935  SDValue Op0 = N->getOperand(0);
15936  SDValue Op1 = N->getOperand(1);
15937
15938  // X86 can't encode an immediate LHS of a sub. See if we can push the
15939  // negation into a preceding instruction.
15940  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
15941    // If the RHS of the sub is a XOR with one use and a constant, invert the
15942    // immediate. Then add one to the LHS of the sub so we can turn
15943    // X-Y -> X+~Y+1, saving one register.
15944    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
15945        isa<ConstantSDNode>(Op1.getOperand(1))) {
15946      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
15947      EVT VT = Op0.getValueType();
15948      SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
15949                                   Op1.getOperand(0),
15950                                   DAG.getConstant(~XorC, VT));
15951      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
15952                         DAG.getConstant(C->getAPIntValue()+1, VT));
15953    }
15954  }
15955
15956  // Try to synthesize horizontal subs from subs of shuffles. Subtraction is
15957  EVT VT = N->getValueType(0);
15958  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
15959       (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
15960      isHorizontalBinOp(Op0, Op1, false))
15961    return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
15962
15963  return OptimizeConditionalInDecrement(N, DAG);
15964}
15965
15966SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
15967                                             DAGCombinerInfo &DCI) const {
15968  SelectionDAG &DAG = DCI.DAG;
15969  switch (N->getOpcode()) {
15970  default: break;
15971  case ISD::EXTRACT_VECTOR_ELT:
15972    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
15973  case ISD::VSELECT:
15974  case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
15975  case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
15976  case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
15977  case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
15978  case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
15979  case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
15980  case ISD::SHL:
15981  case ISD::SRA:
15982  case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
15983  case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
15984  case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
15985  case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
15986  case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
15987  case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
15988  case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG);
15989  case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
15990  case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG);
15991  case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
15992  case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
15993  case X86ISD::FXOR:
15994  case X86ISD::FOR: return PerformFORCombine(N, DAG);
15995  case X86ISD::FMIN:
15996  case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
15997
case X86ISD::FAND: return PerformFANDCombine(N, DAG); 15998 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 15999 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 16000 case ISD::ANY_EXTEND: 16001 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); 16002 case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); 16003 case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); 16004 case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); 16005 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); 16006 case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); 16007 case X86ISD::SHUFP: // Handle all target specific shuffles 16008 case X86ISD::PALIGN: 16009 case X86ISD::UNPCKH: 16010 case X86ISD::UNPCKL: 16011 case X86ISD::MOVHLPS: 16012 case X86ISD::MOVLHPS: 16013 case X86ISD::PSHUFD: 16014 case X86ISD::PSHUFHW: 16015 case X86ISD::PSHUFLW: 16016 case X86ISD::MOVSS: 16017 case X86ISD::MOVSD: 16018 case X86ISD::VPERMILP: 16019 case X86ISD::VPERM2X128: 16020 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); 16021 case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); 16022 } 16023 16024 return SDValue(); 16025} 16026 16027/// isTypeDesirableForOp - Return true if the target has native support for 16028/// the specified value type and it is 'desirable' to use the type for the 16029/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 16030/// instruction encodings are longer and some i16 instructions are slow. 16031bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 16032 if (!isTypeLegal(VT)) 16033 return false; 16034 if (VT != MVT::i16) 16035 return true; 16036 16037 switch (Opc) { 16038 default: 16039 return true; 16040 case ISD::LOAD: 16041 case ISD::SIGN_EXTEND: 16042 case ISD::ZERO_EXTEND: 16043 case ISD::ANY_EXTEND: 16044 case ISD::SHL: 16045 case ISD::SRL: 16046 case ISD::SUB: 16047 case ISD::ADD: 16048 case ISD::MUL: 16049 case ISD::AND: 16050 case ISD::OR: 16051 case ISD::XOR: 16052 return false; 16053 } 16054} 16055 16056/// IsDesirableToPromoteOp - This method query the target whether it is 16057/// beneficial for dag combiner to promote the specified node. If true, it 16058/// should return the desired promotion type by reference. 16059bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 16060 EVT VT = Op.getValueType(); 16061 if (VT != MVT::i16) 16062 return false; 16063 16064 bool Promote = false; 16065 bool Commute = false; 16066 switch (Op.getOpcode()) { 16067 default: break; 16068 case ISD::LOAD: { 16069 LoadSDNode *LD = cast<LoadSDNode>(Op); 16070 // If the non-extending load has a single use and it's not live out, then it 16071 // might be folded. 16072 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 16073 Op.hasOneUse()*/) { 16074 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 16075 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 16076 // The only case where we'd want to promote LOAD (rather then it being 16077 // promoted as an operand is when it's only use is liveout. 16078 if (UI->getOpcode() != ISD::CopyToReg) 16079 return false; 16080 } 16081 } 16082 Promote = true; 16083 break; 16084 } 16085 case ISD::SIGN_EXTEND: 16086 case ISD::ZERO_EXTEND: 16087 case ISD::ANY_EXTEND: 16088 Promote = true; 16089 break; 16090 case ISD::SHL: 16091 case ISD::SRL: { 16092 SDValue N0 = Op.getOperand(0); 16093 // Look out for (store (shl (load), x)). 
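// Promoting the shift to i32 here would block folding the load and store
// into a single read-modify-write shift on memory, so keep it at i16 in
// that case.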
16094 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 16095 return false; 16096 Promote = true; 16097 break; 16098 } 16099 case ISD::ADD: 16100 case ISD::MUL: 16101 case ISD::AND: 16102 case ISD::OR: 16103 case ISD::XOR: 16104 Commute = true; 16105 // fallthrough 16106 case ISD::SUB: { 16107 SDValue N0 = Op.getOperand(0); 16108 SDValue N1 = Op.getOperand(1); 16109 if (!Commute && MayFoldLoad(N1)) 16110 return false; 16111 // Avoid disabling potential load folding opportunities. 16112 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 16113 return false; 16114 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 16115 return false; 16116 Promote = true; 16117 } 16118 } 16119 16120 PVT = MVT::i32; 16121 return Promote; 16122} 16123 16124//===----------------------------------------------------------------------===// 16125// X86 Inline Assembly Support 16126//===----------------------------------------------------------------------===// 16127 16128namespace { 16129 // Helper to match a string separated by whitespace. 16130 bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { 16131 s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. 16132 16133 for (unsigned i = 0, e = args.size(); i != e; ++i) { 16134 StringRef piece(*args[i]); 16135 if (!s.startswith(piece)) // Check if the piece matches. 16136 return false; 16137 16138 s = s.substr(piece.size()); 16139 StringRef::size_type pos = s.find_first_not_of(" \t"); 16140 if (pos == 0) // We matched a prefix. 16141 return false; 16142 16143 s = s.substr(pos); 16144 } 16145 16146 return s.empty(); 16147 } 16148 const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; 16149} 16150 16151bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 16152 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 16153 16154 std::string AsmStr = IA->getAsmString(); 16155 16156 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 16157 if (!Ty || Ty->getBitWidth() % 16 != 0) 16158 return false; 16159 16160 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 16161 SmallVector<StringRef, 4> AsmPieces; 16162 SplitString(AsmStr, AsmPieces, ";\n"); 16163 16164 switch (AsmPieces.size()) { 16165 default: return false; 16166 case 1: 16167 // FIXME: this should verify that we are targeting a 486 or better. If not, 16168 // we will turn this bswap into something that will be lowered to logical 16169 // ops instead of emitting the bswap asm. For now, we don't support 486 or 16170 // lower so don't worry about this. 16171 // bswap $0 16172 if (matchAsm(AsmPieces[0], "bswap", "$0") || 16173 matchAsm(AsmPieces[0], "bswapl", "$0") || 16174 matchAsm(AsmPieces[0], "bswapq", "$0") || 16175 matchAsm(AsmPieces[0], "bswap", "${0:q}") || 16176 matchAsm(AsmPieces[0], "bswapl", "${0:q}") || 16177 matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { 16178 // No need to check constraints, nothing other than the equivalent of 16179 // "=r,0" would be valid here. 
16180 return IntrinsicLowering::LowerToByteSwap(CI); 16181 } 16182 16183 // rorw $$8, ${0:w} --> llvm.bswap.i16 16184 if (CI->getType()->isIntegerTy(16) && 16185 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 16186 (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || 16187 matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { 16188 AsmPieces.clear(); 16189 const std::string &ConstraintsStr = IA->getConstraintString(); 16190 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 16191 std::sort(AsmPieces.begin(), AsmPieces.end()); 16192 if (AsmPieces.size() == 4 && 16193 AsmPieces[0] == "~{cc}" && 16194 AsmPieces[1] == "~{dirflag}" && 16195 AsmPieces[2] == "~{flags}" && 16196 AsmPieces[3] == "~{fpsr}") 16197 return IntrinsicLowering::LowerToByteSwap(CI); 16198 } 16199 break; 16200 case 3: 16201 if (CI->getType()->isIntegerTy(32) && 16202 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 16203 matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && 16204 matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && 16205 matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { 16206 AsmPieces.clear(); 16207 const std::string &ConstraintsStr = IA->getConstraintString(); 16208 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 16209 std::sort(AsmPieces.begin(), AsmPieces.end()); 16210 if (AsmPieces.size() == 4 && 16211 AsmPieces[0] == "~{cc}" && 16212 AsmPieces[1] == "~{dirflag}" && 16213 AsmPieces[2] == "~{flags}" && 16214 AsmPieces[3] == "~{fpsr}") 16215 return IntrinsicLowering::LowerToByteSwap(CI); 16216 } 16217 16218 if (CI->getType()->isIntegerTy(64)) { 16219 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 16220 if (Constraints.size() >= 2 && 16221 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 16222 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 16223 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 16224 if (matchAsm(AsmPieces[0], "bswap", "%eax") && 16225 matchAsm(AsmPieces[1], "bswap", "%edx") && 16226 matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) 16227 return IntrinsicLowering::LowerToByteSwap(CI); 16228 } 16229 } 16230 break; 16231 } 16232 return false; 16233} 16234 16235 16236 16237/// getConstraintType - Given a constraint letter, return the type of 16238/// constraint it is for this target. 16239X86TargetLowering::ConstraintType 16240X86TargetLowering::getConstraintType(const std::string &Constraint) const { 16241 if (Constraint.size() == 1) { 16242 switch (Constraint[0]) { 16243 case 'R': 16244 case 'q': 16245 case 'Q': 16246 case 'f': 16247 case 't': 16248 case 'u': 16249 case 'y': 16250 case 'x': 16251 case 'Y': 16252 case 'l': 16253 return C_RegisterClass; 16254 case 'a': 16255 case 'b': 16256 case 'c': 16257 case 'd': 16258 case 'S': 16259 case 'D': 16260 case 'A': 16261 return C_Register; 16262 case 'I': 16263 case 'J': 16264 case 'K': 16265 case 'L': 16266 case 'M': 16267 case 'N': 16268 case 'G': 16269 case 'C': 16270 case 'e': 16271 case 'Z': 16272 return C_Other; 16273 default: 16274 break; 16275 } 16276 } 16277 return TargetLowering::getConstraintType(Constraint); 16278} 16279 16280/// Examine constraint type and operand type and determine a weight value. 16281/// This object must already have been set up with the operand type 16282/// and the current alternative constraint selected. 
16283TargetLowering::ConstraintWeight 16284 X86TargetLowering::getSingleConstraintMatchWeight( 16285 AsmOperandInfo &info, const char *constraint) const { 16286 ConstraintWeight weight = CW_Invalid; 16287 Value *CallOperandVal = info.CallOperandVal; 16288 // If we don't have a value, we can't do a match, 16289 // but allow it at the lowest weight. 16290 if (CallOperandVal == NULL) 16291 return CW_Default; 16292 Type *type = CallOperandVal->getType(); 16293 // Look at the constraint type. 16294 switch (*constraint) { 16295 default: 16296 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 16297 case 'R': 16298 case 'q': 16299 case 'Q': 16300 case 'a': 16301 case 'b': 16302 case 'c': 16303 case 'd': 16304 case 'S': 16305 case 'D': 16306 case 'A': 16307 if (CallOperandVal->getType()->isIntegerTy()) 16308 weight = CW_SpecificReg; 16309 break; 16310 case 'f': 16311 case 't': 16312 case 'u': 16313 if (type->isFloatingPointTy()) 16314 weight = CW_SpecificReg; 16315 break; 16316 case 'y': 16317 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 16318 weight = CW_SpecificReg; 16319 break; 16320 case 'x': 16321 case 'Y': 16322 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || 16323 ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX())) 16324 weight = CW_Register; 16325 break; 16326 case 'I': 16327 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 16328 if (C->getZExtValue() <= 31) 16329 weight = CW_Constant; 16330 } 16331 break; 16332 case 'J': 16333 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16334 if (C->getZExtValue() <= 63) 16335 weight = CW_Constant; 16336 } 16337 break; 16338 case 'K': 16339 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16340 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 16341 weight = CW_Constant; 16342 } 16343 break; 16344 case 'L': 16345 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16346 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 16347 weight = CW_Constant; 16348 } 16349 break; 16350 case 'M': 16351 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16352 if (C->getZExtValue() <= 3) 16353 weight = CW_Constant; 16354 } 16355 break; 16356 case 'N': 16357 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16358 if (C->getZExtValue() <= 0xff) 16359 weight = CW_Constant; 16360 } 16361 break; 16362 case 'G': 16363 case 'C': 16364 if (dyn_cast<ConstantFP>(CallOperandVal)) { 16365 weight = CW_Constant; 16366 } 16367 break; 16368 case 'e': 16369 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16370 if ((C->getSExtValue() >= -0x80000000LL) && 16371 (C->getSExtValue() <= 0x7fffffffLL)) 16372 weight = CW_Constant; 16373 } 16374 break; 16375 case 'Z': 16376 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 16377 if (C->getZExtValue() <= 0xffffffff) 16378 weight = CW_Constant; 16379 } 16380 break; 16381 } 16382 return weight; 16383} 16384 16385/// LowerXConstraint - try to replace an X constraint, which matches anything, 16386/// with another that has more specific requirements based on the type of the 16387/// corresponding operand. 16388const char *X86TargetLowering:: 16389LowerXConstraint(EVT ConstraintVT) const { 16390 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 16391 // 'f' like normal targets. 
16392 if (ConstraintVT.isFloatingPoint()) { 16393 if (Subtarget->hasSSE2()) 16394 return "Y"; 16395 if (Subtarget->hasSSE1()) 16396 return "x"; 16397 } 16398 16399 return TargetLowering::LowerXConstraint(ConstraintVT); 16400} 16401 16402/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 16403/// vector. If it is invalid, don't add anything to Ops. 16404void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 16405 std::string &Constraint, 16406 std::vector<SDValue>&Ops, 16407 SelectionDAG &DAG) const { 16408 SDValue Result(0, 0); 16409 16410 // Only support length 1 constraints for now. 16411 if (Constraint.length() > 1) return; 16412 16413 char ConstraintLetter = Constraint[0]; 16414 switch (ConstraintLetter) { 16415 default: break; 16416 case 'I': 16417 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16418 if (C->getZExtValue() <= 31) { 16419 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 16420 break; 16421 } 16422 } 16423 return; 16424 case 'J': 16425 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16426 if (C->getZExtValue() <= 63) { 16427 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 16428 break; 16429 } 16430 } 16431 return; 16432 case 'K': 16433 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16434 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 16435 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 16436 break; 16437 } 16438 } 16439 return; 16440 case 'N': 16441 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16442 if (C->getZExtValue() <= 255) { 16443 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 16444 break; 16445 } 16446 } 16447 return; 16448 case 'e': { 16449 // 32-bit signed value 16450 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16451 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 16452 C->getSExtValue())) { 16453 // Widen to 64 bits here to get it sign extended. 16454 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 16455 break; 16456 } 16457 // FIXME gcc accepts some relocatable values here too, but only in certain 16458 // memory models; it's complicated. 16459 } 16460 return; 16461 } 16462 case 'Z': { 16463 // 32-bit unsigned value 16464 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16465 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 16466 C->getZExtValue())) { 16467 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 16468 break; 16469 } 16470 } 16471 // FIXME gcc accepts some relocatable values here too, but only in certain 16472 // memory models; it's complicated. 16473 return; 16474 } 16475 case 'i': { 16476 // Literal immediates are always ok. 16477 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 16478 // Widen to 64 bits here to get it sign extended. 16479 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 16480 break; 16481 } 16482 16483 // In any sort of PIC mode addresses need to be computed at runtime by 16484 // adding in a register or some sort of table lookup. These can't 16485 // be used as immediates. 16486 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 16487 return; 16488 16489 // If we are in non-pic codegen mode, we allow the address of a global (with 16490 // an optional displacement) to be used with 'i'. 16491 GlobalAddressSDNode *GA = 0; 16492 int64_t Offset = 0; 16493 16494 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
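// Walk down any chain of ADD/SUB with constant operands, accumulating the
// displacement, until the GlobalAddress itself is found; anything else
// makes the operand unusable as an 'i' immediate.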
16495 while (1) { 16496 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 16497 Offset += GA->getOffset(); 16498 break; 16499 } else if (Op.getOpcode() == ISD::ADD) { 16500 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 16501 Offset += C->getZExtValue(); 16502 Op = Op.getOperand(0); 16503 continue; 16504 } 16505 } else if (Op.getOpcode() == ISD::SUB) { 16506 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 16507 Offset += -C->getZExtValue(); 16508 Op = Op.getOperand(0); 16509 continue; 16510 } 16511 } 16512 16513 // Otherwise, this isn't something we can handle, reject it. 16514 return; 16515 } 16516 16517 const GlobalValue *GV = GA->getGlobal(); 16518 // If we require an extra load to get this address, as in PIC mode, we 16519 // can't accept it. 16520 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 16521 getTargetMachine()))) 16522 return; 16523 16524 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 16525 GA->getValueType(0), Offset); 16526 break; 16527 } 16528 } 16529 16530 if (Result.getNode()) { 16531 Ops.push_back(Result); 16532 return; 16533 } 16534 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 16535} 16536 16537std::pair<unsigned, const TargetRegisterClass*> 16538X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 16539 EVT VT) const { 16540 // First, see if this is a constraint that directly corresponds to an LLVM 16541 // register class. 16542 if (Constraint.size() == 1) { 16543 // GCC Constraint Letters 16544 switch (Constraint[0]) { 16545 default: break; 16546 // TODO: Slight differences here in allocation order and leaving 16547 // RIP in the class. Do they matter any more here than they do 16548 // in the normal allocation? 16549 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
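// In 64-bit mode every general-purpose register satisfies 'q', so hand back
// the full GR class for the requested width; in 32-bit mode fall through to
// the 'Q' (EAX/EBX/ECX/EDX) handling below.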
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':  // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, &X86::FR64RegClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        return std::make_pair(0U, &X86::VR256RegClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
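  // If the generic lookup fails, a few x86-specific spellings are still
  // accepted below: the x87 stack references "{st(0)}" .. "{st(7)}" (and
  // plain "{st}"), the "{flags}" condition-code register, and the 'A'
  // constraint, which names the EAX+EDX register pair.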
  if (Res.second == 0) {
    // Map st(0) .. st(7) to the corresponding ST0 .. ST7 register.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == &X86::GR16RegClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR8RegClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR32RegClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR64RegClass;
      }
    }
  } else if (Res.second == &X86::FR32RegClass ||
             Res.second == &X86::FR64RegClass ||
             Res.second == &X86::VR128RegClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
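    // Only the register class is adjusted here; Res.first keeps whichever
    // xmm register the generic lookup picked.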

    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
  }

  return Res;
}