X86ISelLowering.cpp revision c5eaae4e9bc75b203b3a9922b480729bc4f340e2
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86ISelLowering.h"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <bitset>
using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference. Idx is an index in the 128 bits we
/// want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl) {
  EVT VT = Vec.getValueType();
  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
  EVT ElVT = VT.getVectorElementType();
  int Factor = VT.getSizeInBits()/128;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getNode(ISD::UNDEF, dl, ResultVT);

  if (isa<ConstantSDNode>(Idx)) {
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
    // we can match to VEXTRACTF128.
    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
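    // (Illustrative example, not in the original source: for a v8i32 input,
    // ElVT is i32, so ElemsPerChunk == 128/32 == 4. An IdxVal of 6 gives
    // NormalizedIdxVal == ((6 * 32) / 128) * 4 == 4, i.e. the extract starts
    // at element 4, the first element of the upper 128-bit half.)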
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                 VecIdx);

    return Result;
  }

  return SDValue();
}

/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128 instruction or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl) {
  if (isa<ConstantSDNode>(Idx)) {
    EVT VT = Vec.getValueType();
    assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");

    EVT ElVT = VT.getVectorElementType();
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    EVT ResultVT = Result.getValueType();

    // Insert the relevant 128 bits.
    unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                         VecIdx);
    return Result;
  }

  return SDValue();
}

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.
  static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird: it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler; for
  // 32-bit code, use register-pressure-specific scheduling.
  // For 32-bit Atom, use Hybrid (register pressure + latency) scheduling.
  if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->isAtom())
    setSchedulingPreference(Sched::Hybrid);
  else
    setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

    // The _ftol2 runtime function has an unusual calling conv, which
    // is modeled by a special pseudo-instruction.
    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
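  // (Illustrative note, not in the original: a Promote action here means the
  //  narrow integer operand is first sign-extended to a wider legal integer
  //  type, e.g. an i8 source is converted via an i32 SINT_TO_FP; the FP
  //  result type itself is unchanged.)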
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1,  Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8,  Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  if (isTargetFTOL()) {
    // Use the _ftol2 runtime function, which has a pseudo-instruction
    // to handle its weird calling convention.
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ, MVT::i8, Promote);
  AddPromotedToType (ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Promote);
  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
  }

  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ, MVT::i8, Promote);
    AddPromotedToType (ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Promote);
    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
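  // (Illustrative note, not in the original: this lets a sequence like
  //  "fence; lock cmpxchg; fence" be folded down to just the locked
  //  instruction, since the lock prefix already provides the ordering the
  //  surrounding fences would have enforced.)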
  setShouldFoldAtomicFences(true);

  // Expand certain atomics.
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else if (TM.Options.EnableSegmentedStacks)
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Expand);

  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
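    // (Illustrative note, not in the original: the custom lowering builds a
    //  constant sign-bit mask and XORs it with the operand, flipping only the
    //  sign bit instead of using a dedicated FP negate instruction.)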
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND, MVT::v8i8, Expand);
  setOperationAction(ISD::AND, MVT::v4i16, Expand);
  setOperationAction(ISD::AND, MVT::v2i32, Expand);
  setOperationAction(ISD::AND, MVT::v1i64, Expand);
  setOperationAction(ISD::OR, MVT::v8i8, Expand);
  setOperationAction(ISD::OR, MVT::v4i16, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::OR, MVT::v1i64, Expand);
  setOperationAction(ISD::XOR, MVT::v8i8, Expand);
  setOperationAction(ISD::XOR, MVT::v4i16, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  setOperationAction(ISD::XOR, MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
  setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    // FIXME: these should be Legal, but that's only for the case where
    // the index is constant. For now custom expand to deal with that.
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
  }

  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL, MVT::v8i16, Custom);
    setOperationAction(ISD::SRL, MVT::v16i8, Custom);

    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
    setOperationAction(ISD::SRA, MVT::v16i8, Custom);

    if (Subtarget->hasAVX2()) {
      setOperationAction(ISD::SRL, MVT::v2i64, Legal);
      setOperationAction(ISD::SRL, MVT::v4i32, Legal);

      setOperationAction(ISD::SHL, MVT::v2i64, Legal);
      setOperationAction(ISD::SHL, MVT::v4i32, Legal);

      setOperationAction(ISD::SRA, MVT::v4i32, Legal);
    } else {
      setOperationAction(ISD::SRL, MVT::v2i64, Custom);
      setOperationAction(ISD::SRL, MVT::v4i32, Custom);

      setOperationAction(ISD::SHL, MVT::v2i64, Custom);
      setOperationAction(ISD::SHL, MVT::v4i32, Custom);

      setOperationAction(ISD::SRA, MVT::v4i32, Custom);
    }
  }

  if (Subtarget->hasSSE42())
    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);

  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
    addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);

    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);

    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom);

    setOperationAction(ISD::SRL, MVT::v16i16, Custom);
    setOperationAction(ISD::SRL, MVT::v32i8, Custom);

    setOperationAction(ISD::SHL, MVT::v16i16, Custom);
    setOperationAction(ISD::SHL, MVT::v32i8, Custom);

    setOperationAction(ISD::SRA, MVT::v16i16, Custom);
    setOperationAction(ISD::SRA, MVT::v32i8, Custom);

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
    setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
    setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);

    if (Subtarget->hasAVX2()) {
      setOperationAction(ISD::ADD, MVT::v4i64, Legal);
      setOperationAction(ISD::ADD, MVT::v8i32, Legal);
      setOperationAction(ISD::ADD, MVT::v16i16, Legal);
      setOperationAction(ISD::ADD, MVT::v32i8, Legal);

      setOperationAction(ISD::SUB, MVT::v4i64, Legal);
      setOperationAction(ISD::SUB, MVT::v8i32, Legal);
      setOperationAction(ISD::SUB, MVT::v16i16, Legal);
      setOperationAction(ISD::SUB, MVT::v32i8, Legal);

      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
      setOperationAction(ISD::MUL, MVT::v8i32, Legal);
      setOperationAction(ISD::MUL, MVT::v16i16, Legal);
      // Don't lower v32i8 because there is no 128-bit byte mul.

      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

      setOperationAction(ISD::SRL, MVT::v4i64, Legal);
      setOperationAction(ISD::SRL, MVT::v8i32, Legal);

      setOperationAction(ISD::SHL, MVT::v4i64, Legal);
      setOperationAction(ISD::SHL, MVT::v8i32, Legal);

      setOperationAction(ISD::SRA, MVT::v8i32, Legal);
    } else {
      setOperationAction(ISD::ADD, MVT::v4i64, Custom);
      setOperationAction(ISD::ADD, MVT::v8i32, Custom);
      setOperationAction(ISD::ADD, MVT::v16i16, Custom);
      setOperationAction(ISD::ADD, MVT::v32i8, Custom);

      setOperationAction(ISD::SUB, MVT::v4i64, Custom);
      setOperationAction(ISD::SUB, MVT::v8i32, Custom);
      setOperationAction(ISD::SUB, MVT::v16i16, Custom);
      setOperationAction(ISD::SUB, MVT::v32i8, Custom);

      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
      setOperationAction(ISD::MUL, MVT::v8i32, Custom);
      setOperationAction(ISD::MUL, MVT::v16i16, Custom);
      // Don't lower v32i8 because there is no 128-bit byte mul.

      setOperationAction(ISD::SRL, MVT::v4i64, Custom);
      setOperationAction(ISD::SRL, MVT::v8i32, Custom);

      setOperationAction(ISD::SHL, MVT::v4i64, Custom);
      setOperationAction(ISD::SHL, MVT::v8i32, Custom);

      setOperationAction(ISD::SRA, MVT::v8i32, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Extract subvector is special because the value type
      // (result) is 128-bit but the source is 256-bit wide.
      if (VT.is128BitVector())
        setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);

      // Do not attempt to custom lower other non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::BUILD_VECTOR, SVT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom);
    }

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-256-bit vectors.
      if (!VT.is256BitVector())
        continue;

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v4i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v4i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v4i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
    }
  }

  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
  // of this type with custom code.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
                       Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);


  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
    // Add/Sub/Mul with overflow operations are custom lowered.
    MVT VT = IntVTs[i];
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);
  }

  // There are no 8-bit 3-address imul/mul instructions.
  setOperationAction(ISD::SMULO, MVT::i8, Expand);
  setOperationAction(ISD::UMULO, MVT::i8, Expand);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::VSELECT);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);
  if (Subtarget->hasBMI())
    setTargetDAGCombine(ISD::XOR);

  computeRegisterProperties();

  // On Darwin, -Os means optimize for size without hurting performance, so
  // do not reduce the limit.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(4); // 2^4 bytes.
  benefitFromCodePlacementOpt = true;

  setPrefFunctionAlignment(4); // 2^4 bytes.
}


EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
  if (!VT.isVector()) return MVT::i8;
  return VT.changeVectorElementTypeToInteger();
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded. If 'IsZeroVal' is true, that means it's safe to return
/// a non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsZeroVal,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (IsZeroVal &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->getStackAlignment() >= 32) {
        if (Subtarget->hasAVX2())
          return MVT::v8i32;
        if (Subtarget->hasAVX())
          return MVT::v8f32;
      }
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
1346 return MVT::f64; 1347 } 1348 } 1349 if (Subtarget->is64Bit() && Size >= 8) 1350 return MVT::i64; 1351 return MVT::i32; 1352} 1353 1354/// getJumpTableEncoding - Return the entry encoding for a jump table in the 1355/// current function. The returned value is a member of the 1356/// MachineJumpTableInfo::JTEntryKind enum. 1357unsigned X86TargetLowering::getJumpTableEncoding() const { 1358 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 1359 // symbol. 1360 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1361 Subtarget->isPICStyleGOT()) 1362 return MachineJumpTableInfo::EK_Custom32; 1363 1364 // Otherwise, use the normal jump table encoding heuristics. 1365 return TargetLowering::getJumpTableEncoding(); 1366} 1367 1368const MCExpr * 1369X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1370 const MachineBasicBlock *MBB, 1371 unsigned uid,MCContext &Ctx) const{ 1372 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1373 Subtarget->isPICStyleGOT()); 1374 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1375 // entries. 1376 return MCSymbolRefExpr::Create(MBB->getSymbol(), 1377 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1378} 1379 1380/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 1381/// jumptable. 1382SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1383 SelectionDAG &DAG) const { 1384 if (!Subtarget->is64Bit()) 1385 // This doesn't have DebugLoc associated with it, but is not really the 1386 // same as a Register. 1387 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1388 return Table; 1389} 1390 1391/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1392/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1393/// MCExpr. 1394const MCExpr *X86TargetLowering:: 1395getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1396 MCContext &Ctx) const { 1397 // X86-64 uses RIP relative addressing based on the jump table label. 1398 if (Subtarget->isPICStyleRIPRel()) 1399 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1400 1401 // Otherwise, the reference is relative to the PIC base. 1402 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); 1403} 1404 1405// FIXME: Why this routine is here? Move to RegInfo! 1406std::pair<const TargetRegisterClass*, uint8_t> 1407X86TargetLowering::findRepresentativeClass(EVT VT) const{ 1408 const TargetRegisterClass *RRC = 0; 1409 uint8_t Cost = 1; 1410 switch (VT.getSimpleVT().SimpleTy) { 1411 default: 1412 return TargetLowering::findRepresentativeClass(VT); 1413 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1414 RRC = (Subtarget->is64Bit() 1415 ? 
X86::GR64RegisterClass : X86::GR32RegisterClass); 1416 break; 1417 case MVT::x86mmx: 1418 RRC = X86::VR64RegisterClass; 1419 break; 1420 case MVT::f32: case MVT::f64: 1421 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1422 case MVT::v4f32: case MVT::v2f64: 1423 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1424 case MVT::v4f64: 1425 RRC = X86::VR128RegisterClass; 1426 break; 1427 } 1428 return std::make_pair(RRC, Cost); 1429} 1430 1431bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1432 unsigned &Offset) const { 1433 if (!Subtarget->isTargetLinux()) 1434 return false; 1435 1436 if (Subtarget->is64Bit()) { 1437 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1438 Offset = 0x28; 1439 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1440 AddressSpace = 256; 1441 else 1442 AddressSpace = 257; 1443 } else { 1444 // %gs:0x14 on i386 1445 Offset = 0x14; 1446 AddressSpace = 256; 1447 } 1448 return true; 1449} 1450 1451 1452//===----------------------------------------------------------------------===// 1453// Return Value Calling Convention Implementation 1454//===----------------------------------------------------------------------===// 1455 1456#include "X86GenCallingConv.inc" 1457 1458bool 1459X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1460 MachineFunction &MF, bool isVarArg, 1461 const SmallVectorImpl<ISD::OutputArg> &Outs, 1462 LLVMContext &Context) const { 1463 SmallVector<CCValAssign, 16> RVLocs; 1464 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1465 RVLocs, Context); 1466 return CCInfo.CheckReturn(Outs, RetCC_X86); 1467} 1468 1469SDValue 1470X86TargetLowering::LowerReturn(SDValue Chain, 1471 CallingConv::ID CallConv, bool isVarArg, 1472 const SmallVectorImpl<ISD::OutputArg> &Outs, 1473 const SmallVectorImpl<SDValue> &OutVals, 1474 DebugLoc dl, SelectionDAG &DAG) const { 1475 MachineFunction &MF = DAG.getMachineFunction(); 1476 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1477 1478 SmallVector<CCValAssign, 16> RVLocs; 1479 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1480 RVLocs, *DAG.getContext()); 1481 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1482 1483 // Add the regs to the liveout set for the function. 1484 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1485 for (unsigned i = 0; i != RVLocs.size(); ++i) 1486 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1487 MRI.addLiveOut(RVLocs[i].getLocReg()); 1488 1489 SDValue Flag; 1490 1491 SmallVector<SDValue, 6> RetOps; 1492 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1493 // Operand #1 = Bytes To Pop 1494 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1495 MVT::i16)); 1496 1497 // Copy the result values into the output registers. 1498 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1499 CCValAssign &VA = RVLocs[i]; 1500 assert(VA.isRegLoc() && "Can only return in registers!"); 1501 SDValue ValToCopy = OutVals[i]; 1502 EVT ValVT = ValToCopy.getValueType(); 1503 1504 // If this is x86-64, and we disabled SSE, we can't return FP values, 1505 // or SSE or MMX vectors. 1506 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1507 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1508 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1509 report_fatal_error("SSE register return with SSE disabled"); 1510 } 1511 // Likewise we can't return F64 values with SSE1 only. 
gcc does so, but 1512 // llvm-gcc has never done it right and no one has noticed, so this 1513 // should be OK for now. 1514 if (ValVT == MVT::f64 && 1515 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) 1516 report_fatal_error("SSE2 register return with SSE2 disabled"); 1517 1518 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1519 // the RET instruction and handled by the FP Stackifier. 1520 if (VA.getLocReg() == X86::ST0 || 1521 VA.getLocReg() == X86::ST1) { 1522 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1523 // change the value to the FP stack register class. 1524 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1525 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1526 RetOps.push_back(ValToCopy); 1527 // Don't emit a copytoreg. 1528 continue; 1529 } 1530 1531 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1532 // which is returned in RAX / RDX. 1533 if (Subtarget->is64Bit()) { 1534 if (ValVT == MVT::x86mmx) { 1535 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1536 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1537 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1538 ValToCopy); 1539 // If we don't have SSE2 available, convert to v4f32 so the generated 1540 // register is legal. 1541 if (!Subtarget->hasSSE2()) 1542 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1543 } 1544 } 1545 } 1546 1547 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1548 Flag = Chain.getValue(1); 1549 } 1550 1551 // The x86-64 ABI for returning structs by value requires that we copy 1552 // the sret argument into %rax for the return. We saved the argument into 1553 // a virtual register in the entry block, so now we copy the value out 1554 // and into %rax. 1555 if (Subtarget->is64Bit() && 1556 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1557 MachineFunction &MF = DAG.getMachineFunction(); 1558 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1559 unsigned Reg = FuncInfo->getSRetReturnReg(); 1560 assert(Reg && 1561 "SRetReturnReg should have been set in LowerFormalArguments()."); 1562 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1563 1564 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1565 Flag = Chain.getValue(1); 1566 1567 // RAX now acts like a return value. 1568 MRI.addLiveOut(X86::RAX); 1569 } 1570 1571 RetOps[0] = Chain; // Update chain. 1572 1573 // Add the flag if we have it. 1574 if (Flag.getNode()) 1575 RetOps.push_back(Flag); 1576 1577 return DAG.getNode(X86ISD::RET_FLAG, dl, 1578 MVT::Other, &RetOps[0], RetOps.size()); 1579} 1580 1581bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { 1582 if (N->getNumValues() != 1) 1583 return false; 1584 if (!N->hasNUsesOfValue(1, 0)) 1585 return false; 1586 1587 SDNode *Copy = *N->use_begin(); 1588 if (Copy->getOpcode() == ISD::CopyToReg) { 1589 // If the copy has a glue operand, we conservatively assume it isn't safe to 1590 // perform a tail call. 
1591 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 1592 return false; 1593 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 1594 return false; 1595 1596 bool HasRet = false; 1597 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1598 UI != UE; ++UI) { 1599 if (UI->getOpcode() != X86ISD::RET_FLAG) 1600 return false; 1601 HasRet = true; 1602 } 1603 1604 return HasRet; 1605} 1606 1607EVT 1608X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, 1609 ISD::NodeType ExtendKind) const { 1610 MVT ReturnMVT; 1611 // TODO: Is this also valid on 32-bit? 1612 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1613 ReturnMVT = MVT::i8; 1614 else 1615 ReturnMVT = MVT::i32; 1616 1617 EVT MinVT = getRegisterType(Context, ReturnMVT); 1618 return VT.bitsLT(MinVT) ? MinVT : VT; 1619} 1620 1621/// LowerCallResult - Lower the result values of a call into the 1622/// appropriate copies out of appropriate physical registers. 1623/// 1624SDValue 1625X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1626 CallingConv::ID CallConv, bool isVarArg, 1627 const SmallVectorImpl<ISD::InputArg> &Ins, 1628 DebugLoc dl, SelectionDAG &DAG, 1629 SmallVectorImpl<SDValue> &InVals) const { 1630 1631 // Assign locations to each value returned by this call. 1632 SmallVector<CCValAssign, 16> RVLocs; 1633 bool Is64Bit = Subtarget->is64Bit(); 1634 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1635 getTargetMachine(), RVLocs, *DAG.getContext()); 1636 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1637 1638 // Copy all of the result registers out of their specified physreg. 1639 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1640 CCValAssign &VA = RVLocs[i]; 1641 EVT CopyVT = VA.getValVT(); 1642 1643 // If this is x86-64, and we disabled SSE, we can't return FP values 1644 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1645 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1646 report_fatal_error("SSE register return with SSE disabled"); 1647 } 1648 1649 SDValue Val; 1650 1651 // If this is a call to a function that returns an fp value on the floating 1652 // point stack, we must guarantee the the value is popped from the stack, so 1653 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1654 // if the return value is not used. We use the FpPOP_RETVAL instruction 1655 // instead. 1656 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1657 // If we prefer to use the value in xmm registers, copy it out as f80 and 1658 // use a truncate to move it from fp stack reg to xmm reg. 1659 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1660 SDValue Ops[] = { Chain, InFlag }; 1661 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, 1662 MVT::Other, MVT::Glue, Ops, 2), 1); 1663 Val = Chain.getValue(0); 1664 1665 // Round the f80 to the right size, which also moves it to the appropriate 1666 // xmm register. 1667 if (CopyVT != VA.getValVT()) 1668 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1669 // This truncation won't change the value. 
1670 DAG.getIntPtrConstant(1)); 1671 } else { 1672 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1673 CopyVT, InFlag).getValue(1); 1674 Val = Chain.getValue(0); 1675 } 1676 InFlag = Chain.getValue(2); 1677 InVals.push_back(Val); 1678 } 1679 1680 return Chain; 1681} 1682 1683 1684//===----------------------------------------------------------------------===// 1685// C & StdCall & Fast Calling Convention implementation 1686//===----------------------------------------------------------------------===// 1687// StdCall calling convention seems to be standard for many Windows' API 1688// routines and around. It differs from C calling convention just a little: 1689// callee should clean up the stack, not caller. Symbols should be also 1690// decorated in some fancy way :) It doesn't support any vector arguments. 1691// For info on fast calling convention see Fast Calling Convention (tail call) 1692// implementation LowerX86_32FastCCCallTo. 1693 1694/// CallIsStructReturn - Determines whether a call uses struct return 1695/// semantics. 1696static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1697 if (Outs.empty()) 1698 return false; 1699 1700 return Outs[0].Flags.isSRet(); 1701} 1702 1703/// ArgsAreStructReturn - Determines whether a function uses struct 1704/// return semantics. 1705static bool 1706ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1707 if (Ins.empty()) 1708 return false; 1709 1710 return Ins[0].Flags.isSRet(); 1711} 1712 1713/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1714/// by "Src" to address "Dst" with size and alignment information specified by 1715/// the specific parameter attribute. The copy will be passed as a byval 1716/// function parameter. 1717static SDValue 1718CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1719 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1720 DebugLoc dl) { 1721 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1722 1723 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1724 /*isVolatile*/false, /*AlwaysInline=*/true, 1725 MachinePointerInfo(), MachinePointerInfo()); 1726} 1727 1728/// IsTailCallConvention - Return true if the calling convention is one that 1729/// supports tail call optimization. 1730static bool IsTailCallConvention(CallingConv::ID CC) { 1731 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1732} 1733 1734bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 1735 if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) 1736 return false; 1737 1738 CallSite CS(CI); 1739 CallingConv::ID CalleeCC = CS.getCallingConv(); 1740 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 1741 return false; 1742 1743 return true; 1744} 1745 1746/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1747/// a tailcall target by changing its ABI. 1748static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, 1749 bool GuaranteedTailCallOpt) { 1750 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1751} 1752 1753SDValue 1754X86TargetLowering::LowerMemArgument(SDValue Chain, 1755 CallingConv::ID CallConv, 1756 const SmallVectorImpl<ISD::InputArg> &Ins, 1757 DebugLoc dl, SelectionDAG &DAG, 1758 const CCValAssign &VA, 1759 MachineFrameInfo *MFI, 1760 unsigned i) const { 1761 // Create the nodes corresponding to a load from this parameter slot. 
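// Illustrative sketch of the two cases handled here (offsets are made up):
//   a plain i32 at incoming stack offset 8:
//     FI = MFI->CreateFixedObject(4, 8, isImmutable); then return a load of
//     i32 from that frame index.
//   a 24-byte byval aggregate at incoming offset 16:
//     FI = MFI->CreateFixedObject(24, 16, isImmutable); the frame index itself
//     is returned and no load is emitted - the callee works on the incoming
//     copy in place.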
1762 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1763 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv, 1764 getTargetMachine().Options.GuaranteedTailCallOpt); 1765 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1766 EVT ValVT; 1767 1768 // If value is passed by pointer we have address passed instead of the value 1769 // itself. 1770 if (VA.getLocInfo() == CCValAssign::Indirect) 1771 ValVT = VA.getLocVT(); 1772 else 1773 ValVT = VA.getValVT(); 1774 1775 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1776 // changed with more analysis. 1777 // In case of tail call optimization mark all arguments mutable. Since they 1778 // could be overwritten by lowering of arguments in case of a tail call. 1779 if (Flags.isByVal()) { 1780 unsigned Bytes = Flags.getByValSize(); 1781 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 1782 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 1783 return DAG.getFrameIndex(FI, getPointerTy()); 1784 } else { 1785 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1786 VA.getLocMemOffset(), isImmutable); 1787 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1788 return DAG.getLoad(ValVT, dl, Chain, FIN, 1789 MachinePointerInfo::getFixedStack(FI), 1790 false, false, false, 0); 1791 } 1792} 1793 1794SDValue 1795X86TargetLowering::LowerFormalArguments(SDValue Chain, 1796 CallingConv::ID CallConv, 1797 bool isVarArg, 1798 const SmallVectorImpl<ISD::InputArg> &Ins, 1799 DebugLoc dl, 1800 SelectionDAG &DAG, 1801 SmallVectorImpl<SDValue> &InVals) 1802 const { 1803 MachineFunction &MF = DAG.getMachineFunction(); 1804 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1805 1806 const Function* Fn = MF.getFunction(); 1807 if (Fn->hasExternalLinkage() && 1808 Subtarget->isTargetCygMing() && 1809 Fn->getName() == "main") 1810 FuncInfo->setForceFramePointer(true); 1811 1812 MachineFrameInfo *MFI = MF.getFrameInfo(); 1813 bool Is64Bit = Subtarget->is64Bit(); 1814 bool IsWindows = Subtarget->isTargetWindows(); 1815 bool IsWin64 = Subtarget->isTargetWin64(); 1816 1817 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1818 "Var args not supported with calling convention fastcc or ghc"); 1819 1820 // Assign locations to all of the incoming arguments. 1821 SmallVector<CCValAssign, 16> ArgLocs; 1822 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1823 ArgLocs, *DAG.getContext()); 1824 1825 // Allocate shadow area for Win64 1826 if (IsWin64) { 1827 CCInfo.AllocateStack(32, 8); 1828 } 1829 1830 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1831 1832 unsigned LastVal = ~0U; 1833 SDValue ArgValue; 1834 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1835 CCValAssign &VA = ArgLocs[i]; 1836 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1837 // places. 
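// Example of the promotion handling further down in this loop: an i8 argument
// that arrives zero-extended in a 32-bit GPR is modelled as
//   CopyFromReg(i32) -> AssertZext(i8) -> TRUNCATE to i8
// so later DAG combines know the upper 24 bits are already zero.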
1838 assert(VA.getValNo() != LastVal && 1839 "Don't support value assigned to multiple locs yet"); 1840 (void)LastVal; 1841 LastVal = VA.getValNo(); 1842 1843 if (VA.isRegLoc()) { 1844 EVT RegVT = VA.getLocVT(); 1845 const TargetRegisterClass *RC; 1846 if (RegVT == MVT::i32) 1847 RC = X86::GR32RegisterClass; 1848 else if (Is64Bit && RegVT == MVT::i64) 1849 RC = X86::GR64RegisterClass; 1850 else if (RegVT == MVT::f32) 1851 RC = X86::FR32RegisterClass; 1852 else if (RegVT == MVT::f64) 1853 RC = X86::FR64RegisterClass; 1854 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1855 RC = X86::VR256RegisterClass; 1856 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1857 RC = X86::VR128RegisterClass; 1858 else if (RegVT == MVT::x86mmx) 1859 RC = X86::VR64RegisterClass; 1860 else 1861 llvm_unreachable("Unknown argument type!"); 1862 1863 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1864 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1865 1866 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1867 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1868 // right size. 1869 if (VA.getLocInfo() == CCValAssign::SExt) 1870 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1871 DAG.getValueType(VA.getValVT())); 1872 else if (VA.getLocInfo() == CCValAssign::ZExt) 1873 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1874 DAG.getValueType(VA.getValVT())); 1875 else if (VA.getLocInfo() == CCValAssign::BCvt) 1876 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1877 1878 if (VA.isExtInLoc()) { 1879 // Handle MMX values passed in XMM regs. 1880 if (RegVT.isVector()) { 1881 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1882 ArgValue); 1883 } else 1884 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1885 } 1886 } else { 1887 assert(VA.isMemLoc()); 1888 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1889 } 1890 1891 // If value is passed via pointer - do a load. 1892 if (VA.getLocInfo() == CCValAssign::Indirect) 1893 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1894 MachinePointerInfo(), false, false, false, 0); 1895 1896 InVals.push_back(ArgValue); 1897 } 1898 1899 // The x86-64 ABI for returning structs by value requires that we copy 1900 // the sret argument into %rax for the return. Save the argument into 1901 // a virtual register so that we can access it from the return points. 1902 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1903 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1904 unsigned Reg = FuncInfo->getSRetReturnReg(); 1905 if (!Reg) { 1906 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1907 FuncInfo->setSRetReturnReg(Reg); 1908 } 1909 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1910 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1911 } 1912 1913 unsigned StackSize = CCInfo.getNextStackOffset(); 1914 // Align stack specially for tail calls. 1915 if (FuncIsMadeTailCallSafe(CallConv, 1916 MF.getTarget().Options.GuaranteedTailCallOpt)) 1917 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1918 1919 // If the function takes variable number of arguments, make a frame index for 1920 // the start of the first vararg value... for expansion of llvm.va_start. 
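// Rough picture of the SysV x86-64 register save area built below, for a
// variadic function whose fixed arguments already use 2 GPRs and 1 XMM reg:
//   VarArgsGPOffset = 2 * 8      = 16   (48 bytes of GPR slots in total)
//   VarArgsFPOffset = 6 * 8 + 16 = 64   (followed by 8 * 16 bytes of XMM slots)
//   RegSaveFrameIndex is a 48 + 128 = 176 byte stack object, 16-byte aligned.
// On Win64 only the four GPR home slots provided by the caller are reused.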
1921 if (isVarArg) { 1922 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1923 CallConv != CallingConv::X86_ThisCall)) { 1924 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1925 } 1926 if (Is64Bit) { 1927 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1928 1929 // FIXME: We should really autogenerate these arrays 1930 static const uint16_t GPR64ArgRegsWin64[] = { 1931 X86::RCX, X86::RDX, X86::R8, X86::R9 1932 }; 1933 static const uint16_t GPR64ArgRegs64Bit[] = { 1934 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1935 }; 1936 static const uint16_t XMMArgRegs64Bit[] = { 1937 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1938 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1939 }; 1940 const uint16_t *GPR64ArgRegs; 1941 unsigned NumXMMRegs = 0; 1942 1943 if (IsWin64) { 1944 // The XMM registers which might contain var arg parameters are shadowed 1945 // in their paired GPR. So we only need to save the GPR to their home 1946 // slots. 1947 TotalNumIntRegs = 4; 1948 GPR64ArgRegs = GPR64ArgRegsWin64; 1949 } else { 1950 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1951 GPR64ArgRegs = GPR64ArgRegs64Bit; 1952 1953 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, 1954 TotalNumXMMRegs); 1955 } 1956 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1957 TotalNumIntRegs); 1958 1959 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1960 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1961 "SSE register cannot be used when SSE is disabled!"); 1962 assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && 1963 NoImplicitFloatOps) && 1964 "SSE register cannot be used when SSE is disabled!"); 1965 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || 1966 !Subtarget->hasSSE1()) 1967 // Kernel mode asks for SSE to be disabled, so don't push them 1968 // on the stack. 1969 TotalNumXMMRegs = 0; 1970 1971 if (IsWin64) { 1972 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); 1973 // Get to the caller-allocated home save location. Add 8 to account 1974 // for the return address. 1975 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 1976 FuncInfo->setRegSaveFrameIndex( 1977 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1978 // Fixup to set vararg frame on shadow area (4 x i64). 1979 if (NumIntRegs < 4) 1980 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1981 } else { 1982 // For X86-64, if there are vararg parameters that are passed via 1983 // registers, then we must store them to their spots on the stack so 1984 // they may be loaded by deferencing the result of va_next. 1985 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1986 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1987 FuncInfo->setRegSaveFrameIndex( 1988 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1989 false)); 1990 } 1991 1992 // Store the integer parameter registers. 
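// Continuing the example above (2 GPRs consumed by fixed arguments), the loop
// below spills the remaining argument GPRs into the save area:
//   RDX -> save area + 16     RCX -> save area + 24
//   R8  -> save area + 32     R9  -> save area + 40
// Any live XMM argument registers are stored afterwards through the
// VASTART_SAVE_XMM_REGS pseudo, whose expansion can skip the vector stores
// when %al reports that no vector registers were passed.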
1993 SmallVector<SDValue, 8> MemOps; 1994 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1995 getPointerTy()); 1996 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1997 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1998 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1999 DAG.getIntPtrConstant(Offset)); 2000 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 2001 X86::GR64RegisterClass); 2002 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 2003 SDValue Store = 2004 DAG.getStore(Val.getValue(1), dl, Val, FIN, 2005 MachinePointerInfo::getFixedStack( 2006 FuncInfo->getRegSaveFrameIndex(), Offset), 2007 false, false, 0); 2008 MemOps.push_back(Store); 2009 Offset += 8; 2010 } 2011 2012 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 2013 // Now store the XMM (fp + vector) parameter registers. 2014 SmallVector<SDValue, 11> SaveXMMOps; 2015 SaveXMMOps.push_back(Chain); 2016 2017 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 2018 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 2019 SaveXMMOps.push_back(ALVal); 2020 2021 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2022 FuncInfo->getRegSaveFrameIndex())); 2023 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2024 FuncInfo->getVarArgsFPOffset())); 2025 2026 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 2027 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 2028 X86::VR128RegisterClass); 2029 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 2030 SaveXMMOps.push_back(Val); 2031 } 2032 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 2033 MVT::Other, 2034 &SaveXMMOps[0], SaveXMMOps.size())); 2035 } 2036 2037 if (!MemOps.empty()) 2038 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2039 &MemOps[0], MemOps.size()); 2040 } 2041 } 2042 2043 // Some CCs need callee pop. 2044 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2045 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2046 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 2047 } else { 2048 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 2049 // If this is an sret function, the return should pop the hidden pointer. 2050 if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2051 ArgsAreStructReturn(Ins)) 2052 FuncInfo->setBytesToPopOnReturn(4); 2053 } 2054 2055 if (!Is64Bit) { 2056 // RegSaveFrameIndex is X86-64 only. 2057 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 2058 if (CallConv == CallingConv::X86_FastCall || 2059 CallConv == CallingConv::X86_ThisCall) 2060 // fastcc functions can't have varargs. 2061 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 2062 } 2063 2064 FuncInfo->setArgumentStackSize(StackSize); 2065 2066 return Chain; 2067} 2068 2069SDValue 2070X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 2071 SDValue StackPtr, SDValue Arg, 2072 DebugLoc dl, SelectionDAG &DAG, 2073 const CCValAssign &VA, 2074 ISD::ArgFlagsTy Flags) const { 2075 unsigned LocMemOffset = VA.getLocMemOffset(); 2076 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 2077 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 2078 if (Flags.isByVal()) 2079 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 2080 2081 return DAG.getStore(Chain, dl, Arg, PtrOff, 2082 MachinePointerInfo::getStack(LocMemOffset), 2083 false, false, 0); 2084} 2085 2086/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 2087/// optimization is performed and it is required. 
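///
/// Worked example of the FPDiff bookkeeping shared by this helper and
/// EmitTailCallStoreRetAddr (numbers are illustrative): with -tailcallopt, if
/// the caller's incoming arguments occupy 16 bytes but the tail callee needs
/// 32, FPDiff = 16 - 32 = -16. The old return address is loaded here and
/// re-stored into a fixed slot at offset FPDiff - SlotSize, i.e. 16 bytes
/// lower than its original slot, where the callee's larger argument area
/// expects to find it.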
2088SDValue 2089X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 2090 SDValue &OutRetAddr, SDValue Chain, 2091 bool IsTailCall, bool Is64Bit, 2092 int FPDiff, DebugLoc dl) const { 2093 // Adjust the Return address stack slot. 2094 EVT VT = getPointerTy(); 2095 OutRetAddr = getReturnAddressFrameIndex(DAG); 2096 2097 // Load the "old" Return address. 2098 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 2099 false, false, false, 0); 2100 return SDValue(OutRetAddr.getNode(), 1); 2101} 2102 2103/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 2104/// optimization is performed and it is required (FPDiff!=0). 2105static SDValue 2106EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 2107 SDValue Chain, SDValue RetAddrFrIdx, 2108 bool Is64Bit, int FPDiff, DebugLoc dl) { 2109 // Store the return address to the appropriate stack slot. 2110 if (!FPDiff) return Chain; 2111 // Calculate the new stack slot for the return address. 2112 int SlotSize = Is64Bit ? 8 : 4; 2113 int NewReturnAddrFI = 2114 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 2115 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 2116 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 2117 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 2118 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 2119 false, false, 0); 2120 return Chain; 2121} 2122 2123SDValue 2124X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 2125 CallingConv::ID CallConv, bool isVarArg, 2126 bool doesNotRet, bool &isTailCall, 2127 const SmallVectorImpl<ISD::OutputArg> &Outs, 2128 const SmallVectorImpl<SDValue> &OutVals, 2129 const SmallVectorImpl<ISD::InputArg> &Ins, 2130 DebugLoc dl, SelectionDAG &DAG, 2131 SmallVectorImpl<SDValue> &InVals) const { 2132 MachineFunction &MF = DAG.getMachineFunction(); 2133 bool Is64Bit = Subtarget->is64Bit(); 2134 bool IsWin64 = Subtarget->isTargetWin64(); 2135 bool IsWindows = Subtarget->isTargetWindows(); 2136 bool IsStructRet = CallIsStructReturn(Outs); 2137 bool IsSibcall = false; 2138 2139 if (MF.getTarget().Options.DisableTailCalls) 2140 isTailCall = false; 2141 2142 if (isTailCall) { 2143 // Check if it's really possible to do a tail call. 2144 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 2145 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 2146 Outs, OutVals, Ins, DAG); 2147 2148 // Sibcalls are automatically detected tailcalls which do not require 2149 // ABI changes. 2150 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) 2151 IsSibcall = true; 2152 2153 if (isTailCall) 2154 ++NumTailCalls; 2155 } 2156 2157 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 2158 "Var args not supported with calling convention fastcc or ghc"); 2159 2160 // Analyze operands of the call, assigning locations to each operand. 2161 SmallVector<CCValAssign, 16> ArgLocs; 2162 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2163 ArgLocs, *DAG.getContext()); 2164 2165 // Allocate shadow area for Win64 2166 if (IsWin64) { 2167 CCInfo.AllocateStack(32, 8); 2168 } 2169 2170 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2171 2172 // Get a count of how many bytes are to be pushed on the stack. 2173 unsigned NumBytes = CCInfo.getNextStackOffset(); 2174 if (IsSibcall) 2175 // This is a sibcall. The memory operands are available in caller's 2176 // own caller's stack. 
2177 NumBytes = 0; 2178 else if (getTargetMachine().Options.GuaranteedTailCallOpt && 2179 IsTailCallConvention(CallConv)) 2180 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 2181 2182 int FPDiff = 0; 2183 if (isTailCall && !IsSibcall) { 2184 // Lower arguments at fp - stackoffset + fpdiff. 2185 unsigned NumBytesCallerPushed = 2186 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 2187 FPDiff = NumBytesCallerPushed - NumBytes; 2188 2189 // Set the delta of movement of the returnaddr stackslot. 2190 // But only set if delta is greater than previous delta. 2191 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 2192 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 2193 } 2194 2195 if (!IsSibcall) 2196 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 2197 2198 SDValue RetAddrFrIdx; 2199 // Load return address for tail calls. 2200 if (isTailCall && FPDiff) 2201 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 2202 Is64Bit, FPDiff, dl); 2203 2204 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2205 SmallVector<SDValue, 8> MemOpChains; 2206 SDValue StackPtr; 2207 2208 // Walk the register/memloc assignments, inserting copies/loads. In the case 2209 // of tail call optimization arguments are handle later. 2210 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2211 CCValAssign &VA = ArgLocs[i]; 2212 EVT RegVT = VA.getLocVT(); 2213 SDValue Arg = OutVals[i]; 2214 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2215 bool isByVal = Flags.isByVal(); 2216 2217 // Promote the value if needed. 2218 switch (VA.getLocInfo()) { 2219 default: llvm_unreachable("Unknown loc info!"); 2220 case CCValAssign::Full: break; 2221 case CCValAssign::SExt: 2222 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 2223 break; 2224 case CCValAssign::ZExt: 2225 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 2226 break; 2227 case CCValAssign::AExt: 2228 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 2229 // Special case: passing MMX values in XMM registers. 2230 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 2231 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 2232 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 2233 } else 2234 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 2235 break; 2236 case CCValAssign::BCvt: 2237 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 2238 break; 2239 case CCValAssign::Indirect: { 2240 // Store the argument. 2241 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2242 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2243 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2244 MachinePointerInfo::getFixedStack(FI), 2245 false, false, 0); 2246 Arg = SpillSlot; 2247 break; 2248 } 2249 } 2250 2251 if (VA.isRegLoc()) { 2252 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2253 if (isVarArg && IsWin64) { 2254 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2255 // shadow reg if callee is a varargs function. 
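// Example: for a Win64 varargs callee, a double passed in XMM1 is also copied
// into RDX (XMM0 -> RCX, XMM2 -> R8, XMM3 -> R9), so the callee's va_arg code
// can read every argument out of the GPR home area without knowing which
// slots held floating-point values.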
2256 unsigned ShadowReg = 0; 2257 switch (VA.getLocReg()) { 2258 case X86::XMM0: ShadowReg = X86::RCX; break; 2259 case X86::XMM1: ShadowReg = X86::RDX; break; 2260 case X86::XMM2: ShadowReg = X86::R8; break; 2261 case X86::XMM3: ShadowReg = X86::R9; break; 2262 } 2263 if (ShadowReg) 2264 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 2265 } 2266 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2267 assert(VA.isMemLoc()); 2268 if (StackPtr.getNode() == 0) 2269 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2270 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2271 dl, DAG, VA, Flags)); 2272 } 2273 } 2274 2275 if (!MemOpChains.empty()) 2276 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2277 &MemOpChains[0], MemOpChains.size()); 2278 2279 // Build a sequence of copy-to-reg nodes chained together with token chain 2280 // and flag operands which copy the outgoing args into registers. 2281 SDValue InFlag; 2282 // Tail call byval lowering might overwrite argument registers so in case of 2283 // tail call optimization the copies to registers are lowered later. 2284 if (!isTailCall) 2285 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2286 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2287 RegsToPass[i].second, InFlag); 2288 InFlag = Chain.getValue(1); 2289 } 2290 2291 if (Subtarget->isPICStyleGOT()) { 2292 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2293 // GOT pointer. 2294 if (!isTailCall) { 2295 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2296 DAG.getNode(X86ISD::GlobalBaseReg, 2297 DebugLoc(), getPointerTy()), 2298 InFlag); 2299 InFlag = Chain.getValue(1); 2300 } else { 2301 // If we are tail calling and generating PIC/GOT style code load the 2302 // address of the callee into ECX. The value in ecx is used as target of 2303 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2304 // for tail calls on PIC/GOT architectures. Normally we would just put the 2305 // address of GOT into ebx and then call target@PLT. But for tail calls 2306 // ebx would be restored (since ebx is callee saved) before jumping to the 2307 // target@PLT. 2308 2309 // Note: The actual moving to ECX is done further down. 2310 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2311 if (G && !G->getGlobal()->hasHiddenVisibility() && 2312 !G->getGlobal()->hasProtectedVisibility()) 2313 Callee = LowerGlobalAddress(Callee, DAG); 2314 else if (isa<ExternalSymbolSDNode>(Callee)) 2315 Callee = LowerExternalSymbol(Callee, DAG); 2316 } 2317 } 2318 2319 if (Is64Bit && isVarArg && !IsWin64) { 2320 // From AMD64 ABI document: 2321 // For calls that may call functions that use varargs or stdargs 2322 // (prototype-less calls or calls to functions containing ellipsis (...) in 2323 // the declaration) %al is used as hidden argument to specify the number 2324 // of SSE registers used. The contents of %al do not need to match exactly 2325 // the number of registers, but must be an ubound on the number of SSE 2326 // registers used and is in the range 0 - 8 inclusive. 2327 2328 // Count the number of XMM registers allocated. 
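// Example: for a call like printf("%f %f\n", x, y) two XMM argument registers
// are in use, so the code below copies the constant 2 into %al immediately
// before the call; any upper bound up to 8 would also satisfy the ABI.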
2329 static const uint16_t XMMArgRegs[] = { 2330 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2331 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2332 }; 2333 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2334 assert((Subtarget->hasSSE1() || !NumXMMRegs) 2335 && "SSE registers cannot be used when SSE is disabled"); 2336 2337 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2338 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2339 InFlag = Chain.getValue(1); 2340 } 2341 2342 2343 // For tail calls lower the arguments to the 'real' stack slot. 2344 if (isTailCall) { 2345 // Force all the incoming stack arguments to be loaded from the stack 2346 // before any new outgoing arguments are stored to the stack, because the 2347 // outgoing stack slots may alias the incoming argument stack slots, and 2348 // the alias isn't otherwise explicit. This is slightly more conservative 2349 // than necessary, because it means that each store effectively depends 2350 // on every argument instead of just those arguments it would clobber. 2351 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2352 2353 SmallVector<SDValue, 8> MemOpChains2; 2354 SDValue FIN; 2355 int FI = 0; 2356 // Do not flag preceding copytoreg stuff together with the following stuff. 2357 InFlag = SDValue(); 2358 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2359 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2360 CCValAssign &VA = ArgLocs[i]; 2361 if (VA.isRegLoc()) 2362 continue; 2363 assert(VA.isMemLoc()); 2364 SDValue Arg = OutVals[i]; 2365 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2366 // Create frame index. 2367 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2368 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2369 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2370 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2371 2372 if (Flags.isByVal()) { 2373 // Copy relative to framepointer. 2374 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2375 if (StackPtr.getNode() == 0) 2376 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2377 getPointerTy()); 2378 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2379 2380 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2381 ArgChain, 2382 Flags, DAG, dl)); 2383 } else { 2384 // Store relative to framepointer. 2385 MemOpChains2.push_back( 2386 DAG.getStore(ArgChain, dl, Arg, FIN, 2387 MachinePointerInfo::getFixedStack(FI), 2388 false, false, 0)); 2389 } 2390 } 2391 } 2392 2393 if (!MemOpChains2.empty()) 2394 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2395 &MemOpChains2[0], MemOpChains2.size()); 2396 2397 // Copy arguments to their registers. 2398 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2399 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2400 RegsToPass[i].second, InFlag); 2401 InFlag = Chain.getValue(1); 2402 } 2403 InFlag =SDValue(); 2404 2405 // Store the return address to the appropriate stack slot. 2406 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2407 FPDiff, dl); 2408 } 2409 2410 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2411 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2412 // In the 64-bit large code model, we have to make all calls 2413 // through a register, since the call instruction's 32-bit 2414 // pc-relative offset may not be large enough to hold the whole 2415 // address. 
2416 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2417 // If the callee is a GlobalAddress node (quite common, every direct call 2418 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2419 // it. 2420 2421 // We should use extra load for direct calls to dllimported functions in 2422 // non-JIT mode. 2423 const GlobalValue *GV = G->getGlobal(); 2424 if (!GV->hasDLLImportLinkage()) { 2425 unsigned char OpFlags = 0; 2426 bool ExtraLoad = false; 2427 unsigned WrapperKind = ISD::DELETED_NODE; 2428 2429 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2430 // external symbols most go through the PLT in PIC mode. If the symbol 2431 // has hidden or protected visibility, or if it is static or local, then 2432 // we don't need to use the PLT - we can directly call it. 2433 if (Subtarget->isTargetELF() && 2434 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2435 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2436 OpFlags = X86II::MO_PLT; 2437 } else if (Subtarget->isPICStyleStubAny() && 2438 (GV->isDeclaration() || GV->isWeakForLinker()) && 2439 (!Subtarget->getTargetTriple().isMacOSX() || 2440 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2441 // PC-relative references to external symbols should go through $stub, 2442 // unless we're building with the leopard linker or later, which 2443 // automatically synthesizes these stubs. 2444 OpFlags = X86II::MO_DARWIN_STUB; 2445 } else if (Subtarget->isPICStyleRIPRel() && 2446 isa<Function>(GV) && 2447 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { 2448 // If the function is marked as non-lazy, generate an indirect call 2449 // which loads from the GOT directly. This avoids runtime overhead 2450 // at the cost of eager binding (and one extra byte of encoding). 2451 OpFlags = X86II::MO_GOTPCREL; 2452 WrapperKind = X86ISD::WrapperRIP; 2453 ExtraLoad = true; 2454 } 2455 2456 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2457 G->getOffset(), OpFlags); 2458 2459 // Add a wrapper if needed. 2460 if (WrapperKind != ISD::DELETED_NODE) 2461 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2462 // Add extra indirection if needed. 2463 if (ExtraLoad) 2464 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2465 MachinePointerInfo::getGOT(), 2466 false, false, false, 0); 2467 } 2468 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2469 unsigned char OpFlags = 0; 2470 2471 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2472 // external symbols should go through the PLT. 2473 if (Subtarget->isTargetELF() && 2474 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2475 OpFlags = X86II::MO_PLT; 2476 } else if (Subtarget->isPICStyleStubAny() && 2477 (!Subtarget->getTargetTriple().isMacOSX() || 2478 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2479 // PC-relative references to external symbols should go through $stub, 2480 // unless we're building with the leopard linker or later, which 2481 // automatically synthesizes these stubs. 2482 OpFlags = X86II::MO_DARWIN_STUB; 2483 } 2484 2485 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2486 OpFlags); 2487 } 2488 2489 // Returns a chain & a flag for retval copy to use. 
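// Summary of the callee operand-flag choices made above (illustrative):
//   ELF + PIC, default-visibility external function     -> MO_PLT
//   Darwin with a pre-10.5 linker, undefined function   -> MO_DARWIN_STUB
//   RIP-relative callee marked nonlazybind              -> MO_GOTPCREL wrapped
//       in X86ISD::WrapperRIP, plus an extra load of the entry from the GOT.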
2490 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2491 SmallVector<SDValue, 8> Ops; 2492 2493 if (!IsSibcall && isTailCall) { 2494 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2495 DAG.getIntPtrConstant(0, true), InFlag); 2496 InFlag = Chain.getValue(1); 2497 } 2498 2499 Ops.push_back(Chain); 2500 Ops.push_back(Callee); 2501 2502 if (isTailCall) 2503 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2504 2505 // Add argument registers to the end of the list so that they are known live 2506 // into the call. 2507 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2508 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2509 RegsToPass[i].second.getValueType())); 2510 2511 // Add an implicit use GOT pointer in EBX. 2512 if (!isTailCall && Subtarget->isPICStyleGOT()) 2513 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2514 2515 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 2516 if (Is64Bit && isVarArg && !IsWin64) 2517 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2518 2519 // Add a register mask operand representing the call-preserved registers. 2520 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 2521 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); 2522 assert(Mask && "Missing call preserved mask for calling convention"); 2523 Ops.push_back(DAG.getRegisterMask(Mask)); 2524 2525 if (InFlag.getNode()) 2526 Ops.push_back(InFlag); 2527 2528 if (isTailCall) { 2529 // We used to do: 2530 //// If this is the first return lowered for this function, add the regs 2531 //// to the liveout set for the function. 2532 // This isn't right, although it's probably harmless on x86; liveouts 2533 // should be computed from returns not tail calls. Consider a void 2534 // function making a tail call to a function returning int. 2535 return DAG.getNode(X86ISD::TC_RETURN, dl, 2536 NodeTys, &Ops[0], Ops.size()); 2537 } 2538 2539 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2540 InFlag = Chain.getValue(1); 2541 2542 // Create the CALLSEQ_END node. 2543 unsigned NumBytesForCalleeToPush; 2544 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2545 getTargetMachine().Options.GuaranteedTailCallOpt)) 2546 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2547 else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && 2548 IsStructRet) 2549 // If this is a call to a struct-return function, the callee 2550 // pops the hidden struct pointer, so we have to push it back. 2551 // This is common for Darwin/X86, Linux & Mingw32 targets. 2552 // For MSVC Win32 targets, the caller pops the hidden struct pointer. 2553 NumBytesForCalleeToPush = 4; 2554 else 2555 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2556 2557 // Returns a flag for retval copy to use. 2558 if (!IsSibcall) { 2559 Chain = DAG.getCALLSEQ_END(Chain, 2560 DAG.getIntPtrConstant(NumBytes, true), 2561 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2562 true), 2563 InFlag); 2564 InFlag = Chain.getValue(1); 2565 } 2566 2567 // Handle result values, copying them out of physregs into vregs that we 2568 // return. 
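// Example of the callee-pop accounting above: a 32-bit cdecl call to an sret
// function records NumBytesForCalleeToPush = 4, because the callee pops only
// the hidden struct-return pointer (ret $4) while the caller remains
// responsible for the rest of the argument bytes.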
2569 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2570 Ins, dl, DAG, InVals); 2571} 2572 2573 2574//===----------------------------------------------------------------------===// 2575// Fast Calling Convention (tail call) implementation 2576//===----------------------------------------------------------------------===// 2577 2578// Like StdCall, the callee cleans up the arguments, except that ECX is 2579// reserved for storing the tail-called function's address. Only 2 registers are 2580// free for argument passing (inreg). Tail call optimization is performed 2581// provided: 2582// * tailcallopt is enabled 2583// * caller/callee are fastcc 2584// On X86_64 architecture with GOT-style position independent code only local 2585// (within module) calls are supported at the moment. 2586// To keep the stack aligned according to the platform ABI, the function 2587// GetAlignedArgumentStackSize ensures that the argument delta is always a multiple 2588// of the stack alignment. (Dynamic linkers need this - Darwin's dyld, for example.) 2589// If a tail-called callee has more arguments than the caller, the 2590// caller needs to make sure that there is room to move the RETADDR to. This is 2591// achieved by reserving an area the size of the argument delta right after the 2592// original RETADDR, but before the saved frame pointer or the spilled registers, 2593// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4) 2594// stack layout: 2595// arg1 2596// arg2 2597// RETADDR 2598// [ new RETADDR 2599// move area ] 2600// (possible EBP) 2601// ESI 2602// EDI 2603// local1 .. 2604 2605/// GetAlignedArgumentStackSize - Align the stack size so that it ends up at, 2606/// e.g., 16n + 12 for a 16-byte alignment requirement and a 4-byte slot size. 2607unsigned 2608X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2609 SelectionDAG& DAG) const { 2610 MachineFunction &MF = DAG.getMachineFunction(); 2611 const TargetMachine &TM = MF.getTarget(); 2612 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 2613 unsigned StackAlignment = TFI.getStackAlignment(); 2614 uint64_t AlignMask = StackAlignment - 1; 2615 int64_t Offset = StackSize; 2616 uint64_t SlotSize = TD->getPointerSize(); 2617 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2618 // The low bits are at most (StackAlignment - SlotSize); just add the difference. 2619 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2620 } else { 2621 // Mask out the low bits, then add one StackAlignment plus (StackAlignment - SlotSize). 2622 Offset = ((~AlignMask) & Offset) + StackAlignment + 2623 (StackAlignment-SlotSize); 2624 } 2625 return Offset; 2626} 2627 2628/// MatchingStackOffset - Return true if the given stack call argument is 2629/// already available in the same position (relatively) of the caller's 2630/// incoming argument stack.
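///
/// Illustrative cases: in  int f(int a, int b) { return g(a, b); }  each
/// outgoing stack argument is a load straight from the matching fixed incoming
/// slot, so offsets and sizes line up and a sibcall remains possible; in
/// int f(int a, int b) { return g(b, a); }  the offsets differ, the incoming
/// stack would have to be rewritten in place, and this check fails.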
2631static 2632bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2633 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2634 const X86InstrInfo *TII) { 2635 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2636 int FI = INT_MAX; 2637 if (Arg.getOpcode() == ISD::CopyFromReg) { 2638 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2639 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2640 return false; 2641 MachineInstr *Def = MRI->getVRegDef(VR); 2642 if (!Def) 2643 return false; 2644 if (!Flags.isByVal()) { 2645 if (!TII->isLoadFromStackSlot(Def, FI)) 2646 return false; 2647 } else { 2648 unsigned Opcode = Def->getOpcode(); 2649 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2650 Def->getOperand(1).isFI()) { 2651 FI = Def->getOperand(1).getIndex(); 2652 Bytes = Flags.getByValSize(); 2653 } else 2654 return false; 2655 } 2656 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2657 if (Flags.isByVal()) 2658 // ByVal argument is passed in as a pointer but it's now being 2659 // dereferenced. e.g. 2660 // define @foo(%struct.X* %A) { 2661 // tail call @bar(%struct.X* byval %A) 2662 // } 2663 return false; 2664 SDValue Ptr = Ld->getBasePtr(); 2665 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2666 if (!FINode) 2667 return false; 2668 FI = FINode->getIndex(); 2669 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2670 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2671 FI = FINode->getIndex(); 2672 Bytes = Flags.getByValSize(); 2673 } else 2674 return false; 2675 2676 assert(FI != INT_MAX); 2677 if (!MFI->isFixedObjectIndex(FI)) 2678 return false; 2679 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2680} 2681 2682/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2683/// for tail call optimization. Targets which want to do tail call 2684/// optimization should implement this function. 2685bool 2686X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2687 CallingConv::ID CalleeCC, 2688 bool isVarArg, 2689 bool isCalleeStructRet, 2690 bool isCallerStructRet, 2691 const SmallVectorImpl<ISD::OutputArg> &Outs, 2692 const SmallVectorImpl<SDValue> &OutVals, 2693 const SmallVectorImpl<ISD::InputArg> &Ins, 2694 SelectionDAG& DAG) const { 2695 if (!IsTailCallConvention(CalleeCC) && 2696 CalleeCC != CallingConv::C) 2697 return false; 2698 2699 // If -tailcallopt is specified, make fastcc functions tail-callable. 2700 const MachineFunction &MF = DAG.getMachineFunction(); 2701 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2702 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2703 bool CCMatch = CallerCC == CalleeCC; 2704 2705 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2706 if (IsTailCallConvention(CalleeCC) && CCMatch) 2707 return true; 2708 return false; 2709 } 2710 2711 // Look for obvious safe cases to perform tail call optimization that do not 2712 // require ABI changes. This is what gcc calls sibcall. 2713 2714 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2715 // emit a special epilogue. 2716 if (RegInfo->needsStackRealignment(MF)) 2717 return false; 2718 2719 // Also avoid sibcall optimization if either caller or callee uses struct 2720 // return semantics. 2721 if (isCalleeStructRet || isCallerStructRet) 2722 return false; 2723 2724 // An stdcall caller is expected to clean up its arguments; the callee 2725 // isn't going to do that. 
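// Concretely: an X86_StdCall function pops its own incoming arguments with
// `ret N` on return; if it tail-jumped into, say, a cdecl callee, that callee
// would finish with a plain `ret` and nobody would pop those N bytes, so the
// convention mismatch is rejected below.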
2726 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2727 return false; 2728 2729 // Do not sibcall optimize vararg calls unless all arguments are passed via 2730 // registers. 2731 if (isVarArg && !Outs.empty()) { 2732 2733 // Optimizing for varargs on Win64 is unlikely to be safe without 2734 // additional testing. 2735 if (Subtarget->isTargetWin64()) 2736 return false; 2737 2738 SmallVector<CCValAssign, 16> ArgLocs; 2739 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2740 getTargetMachine(), ArgLocs, *DAG.getContext()); 2741 2742 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2743 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2744 if (!ArgLocs[i].isRegLoc()) 2745 return false; 2746 } 2747 2748 // If the call result is in ST0 / ST1, it needs to be popped off the x87 2749 // stack. Therefore, if it's not used by the call it is not safe to optimize 2750 // this into a sibcall. 2751 bool Unused = false; 2752 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2753 if (!Ins[i].Used) { 2754 Unused = true; 2755 break; 2756 } 2757 } 2758 if (Unused) { 2759 SmallVector<CCValAssign, 16> RVLocs; 2760 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 2761 getTargetMachine(), RVLocs, *DAG.getContext()); 2762 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2763 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2764 CCValAssign &VA = RVLocs[i]; 2765 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2766 return false; 2767 } 2768 } 2769 2770 // If the calling conventions do not match, then we'd better make sure the 2771 // results are returned in the same way as what the caller expects. 2772 if (!CCMatch) { 2773 SmallVector<CCValAssign, 16> RVLocs1; 2774 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 2775 getTargetMachine(), RVLocs1, *DAG.getContext()); 2776 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2777 2778 SmallVector<CCValAssign, 16> RVLocs2; 2779 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 2780 getTargetMachine(), RVLocs2, *DAG.getContext()); 2781 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2782 2783 if (RVLocs1.size() != RVLocs2.size()) 2784 return false; 2785 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2786 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2787 return false; 2788 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2789 return false; 2790 if (RVLocs1[i].isRegLoc()) { 2791 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2792 return false; 2793 } else { 2794 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2795 return false; 2796 } 2797 } 2798 } 2799 2800 // If the callee takes no arguments then go on to check the results of the 2801 // call. 2802 if (!Outs.empty()) { 2803 // Check if stack adjustment is needed. For now, do not do this if any 2804 // argument is passed on the stack. 2805 SmallVector<CCValAssign, 16> ArgLocs; 2806 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2807 getTargetMachine(), ArgLocs, *DAG.getContext()); 2808 2809 // Allocate shadow area for Win64 2810 if (Subtarget->isTargetWin64()) { 2811 CCInfo.AllocateStack(32, 8); 2812 } 2813 2814 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2815 if (CCInfo.getNextStackOffset()) { 2816 MachineFunction &MF = DAG.getMachineFunction(); 2817 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2818 return false; 2819 2820 // Check if the arguments are already laid out in the right way as 2821 // the caller's fixed stack objects. 
2822 MachineFrameInfo *MFI = MF.getFrameInfo(); 2823 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2824 const X86InstrInfo *TII = 2825 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2826 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2827 CCValAssign &VA = ArgLocs[i]; 2828 SDValue Arg = OutVals[i]; 2829 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2830 if (VA.getLocInfo() == CCValAssign::Indirect) 2831 return false; 2832 if (!VA.isRegLoc()) { 2833 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2834 MFI, MRI, TII)) 2835 return false; 2836 } 2837 } 2838 } 2839 2840 // If the tailcall address may be in a register, then make sure it's 2841 // possible to register allocate for it. In 32-bit, the call address can 2842 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2843 // callee-saved registers are restored. These happen to be the same 2844 // registers used to pass 'inreg' arguments so watch out for those. 2845 if (!Subtarget->is64Bit() && 2846 !isa<GlobalAddressSDNode>(Callee) && 2847 !isa<ExternalSymbolSDNode>(Callee)) { 2848 unsigned NumInRegs = 0; 2849 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2850 CCValAssign &VA = ArgLocs[i]; 2851 if (!VA.isRegLoc()) 2852 continue; 2853 unsigned Reg = VA.getLocReg(); 2854 switch (Reg) { 2855 default: break; 2856 case X86::EAX: case X86::EDX: case X86::ECX: 2857 if (++NumInRegs == 3) 2858 return false; 2859 break; 2860 } 2861 } 2862 } 2863 } 2864 2865 return true; 2866} 2867 2868FastISel * 2869X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2870 return X86::createFastISel(funcInfo); 2871} 2872 2873 2874//===----------------------------------------------------------------------===// 2875// Other Lowering Hooks 2876//===----------------------------------------------------------------------===// 2877 2878static bool MayFoldLoad(SDValue Op) { 2879 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2880} 2881 2882static bool MayFoldIntoStore(SDValue Op) { 2883 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2884} 2885 2886static bool isTargetShuffle(unsigned Opcode) { 2887 switch(Opcode) { 2888 default: return false; 2889 case X86ISD::PSHUFD: 2890 case X86ISD::PSHUFHW: 2891 case X86ISD::PSHUFLW: 2892 case X86ISD::SHUFP: 2893 case X86ISD::PALIGN: 2894 case X86ISD::MOVLHPS: 2895 case X86ISD::MOVLHPD: 2896 case X86ISD::MOVHLPS: 2897 case X86ISD::MOVLPS: 2898 case X86ISD::MOVLPD: 2899 case X86ISD::MOVSHDUP: 2900 case X86ISD::MOVSLDUP: 2901 case X86ISD::MOVDDUP: 2902 case X86ISD::MOVSS: 2903 case X86ISD::MOVSD: 2904 case X86ISD::UNPCKL: 2905 case X86ISD::UNPCKH: 2906 case X86ISD::VPERMILP: 2907 case X86ISD::VPERM2X128: 2908 return true; 2909 } 2910} 2911 2912static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2913 SDValue V1, SelectionDAG &DAG) { 2914 switch(Opc) { 2915 default: llvm_unreachable("Unknown x86 shuffle node"); 2916 case X86ISD::MOVSHDUP: 2917 case X86ISD::MOVSLDUP: 2918 case X86ISD::MOVDDUP: 2919 return DAG.getNode(Opc, dl, VT, V1); 2920 } 2921} 2922 2923static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2924 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2925 switch(Opc) { 2926 default: llvm_unreachable("Unknown x86 shuffle node"); 2927 case X86ISD::PSHUFD: 2928 case X86ISD::PSHUFHW: 2929 case X86ISD::PSHUFLW: 2930 case X86ISD::VPERMILP: 2931 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2932 } 2933} 2934 2935static SDValue 
getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2936                     SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) {
2937  switch(Opc) {
2938  default: llvm_unreachable("Unknown x86 shuffle node");
2939  case X86ISD::PALIGN:
2940  case X86ISD::SHUFP:
2941  case X86ISD::VPERM2X128:
2942    return DAG.getNode(Opc, dl, VT, V1, V2,
2943                       DAG.getConstant(TargetMask, MVT::i8));
2944  }
2945}
2946
2947static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
2948                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
2949  switch(Opc) {
2950  default: llvm_unreachable("Unknown x86 shuffle node");
2951  case X86ISD::MOVLHPS:
2952  case X86ISD::MOVLHPD:
2953  case X86ISD::MOVHLPS:
2954  case X86ISD::MOVLPS:
2955  case X86ISD::MOVLPD:
2956  case X86ISD::MOVSS:
2957  case X86ISD::MOVSD:
2958  case X86ISD::UNPCKL:
2959  case X86ISD::UNPCKH:
2960    return DAG.getNode(Opc, dl, VT, V1, V2);
2961  }
2962}
2963
2964SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2965  MachineFunction &MF = DAG.getMachineFunction();
2966  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2967  int ReturnAddrIndex = FuncInfo->getRAIndex();
2968
2969  if (ReturnAddrIndex == 0) {
2970    // Set up a frame object for the return address.
2971    uint64_t SlotSize = TD->getPointerSize();
2972    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2973                                                           false);
2974    FuncInfo->setRAIndex(ReturnAddrIndex);
2975  }
2976
2977  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2978}
2979
2980
2981bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2982                                       bool hasSymbolicDisplacement) {
2983  // Offset should fit into 32 bit immediate field.
2984  if (!isInt<32>(Offset))
2985    return false;
2986
2987  // If we don't have a symbolic displacement - we don't have any extra
2988  // restrictions.
2989  if (!hasSymbolicDisplacement)
2990    return true;
2991
2992  // FIXME: Some tweaks might be needed for medium code model.
2993  if (M != CodeModel::Small && M != CodeModel::Kernel)
2994    return false;
2995
2996  // For the small code model, we assume that the last object is 16MB before
2997  // the end of the 31-bit boundary. We may also accept pretty large negative
2998  // constants knowing that all objects are in the positive half of the address space.
2999  if (M == CodeModel::Small && Offset < 16*1024*1024)
3000    return true;
3001
3002  // For the kernel code model we know that all objects reside in the negative
3003  // half of the 32-bit address space. We must not accept negative offsets, since
3004  // they may be just out of range, but we may accept pretty large positive ones.
3005  if (M == CodeModel::Kernel && Offset > 0)
3006    return true;
3007
3008  return false;
3009}
3010
3011/// isCalleePop - Determines whether the callee is required to pop its
3012/// own arguments. Callee pop is necessary to support tail calls.
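/// For example, in 32-bit mode the stdcall, fastcall, and thiscall conventions
/// make the callee pop its arguments (ret imm16), while the C convention leaves
/// the cleanup to the caller; fastcc and GHC calls only use callee pop when
/// -tailcallopt (GuaranteedTailCallOpt) is enabled, and vararg calls never do.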
3013bool X86::isCalleePop(CallingConv::ID CallingConv, 3014 bool is64Bit, bool IsVarArg, bool TailCallOpt) { 3015 if (IsVarArg) 3016 return false; 3017 3018 switch (CallingConv) { 3019 default: 3020 return false; 3021 case CallingConv::X86_StdCall: 3022 return !is64Bit; 3023 case CallingConv::X86_FastCall: 3024 return !is64Bit; 3025 case CallingConv::X86_ThisCall: 3026 return !is64Bit; 3027 case CallingConv::Fast: 3028 return TailCallOpt; 3029 case CallingConv::GHC: 3030 return TailCallOpt; 3031 } 3032} 3033 3034/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 3035/// specific condition code, returning the condition code and the LHS/RHS of the 3036/// comparison to make. 3037static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 3038 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 3039 if (!isFP) { 3040 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 3041 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 3042 // X > -1 -> X == 0, jump !sign. 3043 RHS = DAG.getConstant(0, RHS.getValueType()); 3044 return X86::COND_NS; 3045 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 3046 // X < 0 -> X == 0, jump on sign. 3047 return X86::COND_S; 3048 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 3049 // X < 1 -> X <= 0 3050 RHS = DAG.getConstant(0, RHS.getValueType()); 3051 return X86::COND_LE; 3052 } 3053 } 3054 3055 switch (SetCCOpcode) { 3056 default: llvm_unreachable("Invalid integer condition!"); 3057 case ISD::SETEQ: return X86::COND_E; 3058 case ISD::SETGT: return X86::COND_G; 3059 case ISD::SETGE: return X86::COND_GE; 3060 case ISD::SETLT: return X86::COND_L; 3061 case ISD::SETLE: return X86::COND_LE; 3062 case ISD::SETNE: return X86::COND_NE; 3063 case ISD::SETULT: return X86::COND_B; 3064 case ISD::SETUGT: return X86::COND_A; 3065 case ISD::SETULE: return X86::COND_BE; 3066 case ISD::SETUGE: return X86::COND_AE; 3067 } 3068 } 3069 3070 // First determine if it is required or is profitable to flip the operands. 3071 3072 // If LHS is a foldable load, but RHS is not, flip the condition. 
3073  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3074      !ISD::isNON_EXTLoad(RHS.getNode())) {
3075    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3076    std::swap(LHS, RHS);
3077  }
3078
3079  switch (SetCCOpcode) {
3080  default: break;
3081  case ISD::SETOLT:
3082  case ISD::SETOLE:
3083  case ISD::SETUGT:
3084  case ISD::SETUGE:
3085    std::swap(LHS, RHS);
3086    break;
3087  }
3088
3089  // On a floating point condition, the flags are set as follows:
3090  //  ZF  PF  CF   op
3091  //   0 | 0 | 0 | X > Y
3092  //   0 | 0 | 1 | X < Y
3093  //   1 | 0 | 0 | X == Y
3094  //   1 | 1 | 1 | unordered
3095  switch (SetCCOpcode) {
3096  default: llvm_unreachable("Condcode should be pre-legalized away");
3097  case ISD::SETUEQ:
3098  case ISD::SETEQ:  return X86::COND_E;
3099  case ISD::SETOLT:              // flipped
3100  case ISD::SETOGT:
3101  case ISD::SETGT:  return X86::COND_A;
3102  case ISD::SETOLE:              // flipped
3103  case ISD::SETOGE:
3104  case ISD::SETGE:  return X86::COND_AE;
3105  case ISD::SETUGT:              // flipped
3106  case ISD::SETULT:
3107  case ISD::SETLT:  return X86::COND_B;
3108  case ISD::SETUGE:              // flipped
3109  case ISD::SETULE:
3110  case ISD::SETLE:  return X86::COND_BE;
3111  case ISD::SETONE:
3112  case ISD::SETNE:  return X86::COND_NE;
3113  case ISD::SETUO:  return X86::COND_P;
3114  case ISD::SETO:   return X86::COND_NP;
3115  case ISD::SETOEQ:
3116  case ISD::SETUNE: return X86::COND_INVALID;
3117  }
3118}
3119
3120/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
3121/// code? The current x86 ISA includes the following FP cmov instructions:
3122/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3123static bool hasFPCMov(unsigned X86CC) {
3124  switch (X86CC) {
3125  default:
3126    return false;
3127  case X86::COND_B:
3128  case X86::COND_BE:
3129  case X86::COND_E:
3130  case X86::COND_P:
3131  case X86::COND_A:
3132  case X86::COND_AE:
3133  case X86::COND_NE:
3134  case X86::COND_NP:
3135    return true;
3136  }
3137}
3138
3139/// isFPImmLegal - Returns true if the target can instruction select the
3140/// specified FP immediate natively. If false, the legalizer will
3141/// materialize the FP immediate as a load from a constant pool.
3142bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3143  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3144    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3145      return true;
3146  }
3147  return false;
3148}
3149
3150/// isUndefOrInRange - Return true if Val is undef or if its value falls within
3151/// the specified range [Low, Hi).
3152static bool isUndefOrInRange(int Val, int Low, int Hi) {
3153  return (Val < 0) || (Val >= Low && Val < Hi);
3154}
3155
3156/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
3157/// specified value.
3158static bool isUndefOrEqual(int Val, int CmpVal) {
3159  if (Val < 0 || Val == CmpVal)
3160    return true;
3161  return false;
3162}
3163
3164/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3165/// at position Pos and ending at Pos+Size, falls within the specified
3166/// sequential range [Low, Low+Size) or is undef.
3167static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3168                                       int Pos, int Size, int Low) {
3169  for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3170    if (!isUndefOrEqual(Mask[i], Low))
3171      return false;
3172  return true;
3173}
3174
3175/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3176/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
3177/// the second operand.
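/// For example, on v4i32 the mask <2, 3, 0, 1> is a valid PSHUFD mask, while
/// <0, 4, 1, 5> is not, since element 4 would have to come from the second
/// source vector.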
3178static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { 3179 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 3180 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 3181 if (VT == MVT::v2f64 || VT == MVT::v2i64) 3182 return (Mask[0] < 2 && Mask[1] < 2); 3183 return false; 3184} 3185 3186/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 3187/// is suitable for input to PSHUFHW. 3188static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) { 3189 if (VT != MVT::v8i16) 3190 return false; 3191 3192 // Lower quadword copied in order or undef. 3193 if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) 3194 return false; 3195 3196 // Upper quadword shuffled. 3197 for (unsigned i = 4; i != 8; ++i) 3198 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 3199 return false; 3200 3201 return true; 3202} 3203 3204/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 3205/// is suitable for input to PSHUFLW. 3206static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) { 3207 if (VT != MVT::v8i16) 3208 return false; 3209 3210 // Upper quadword copied in order. 3211 if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) 3212 return false; 3213 3214 // Lower quadword shuffled. 3215 for (unsigned i = 0; i != 4; ++i) 3216 if (Mask[i] >= 4) 3217 return false; 3218 3219 return true; 3220} 3221 3222/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 3223/// is suitable for input to PALIGNR. 3224static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT, 3225 const X86Subtarget *Subtarget) { 3226 if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) || 3227 (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())) 3228 return false; 3229 3230 unsigned NumElts = VT.getVectorNumElements(); 3231 unsigned NumLanes = VT.getSizeInBits()/128; 3232 unsigned NumLaneElts = NumElts/NumLanes; 3233 3234 // Do not handle 64-bit element shuffles with palignr. 3235 if (NumLaneElts == 2) 3236 return false; 3237 3238 for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { 3239 unsigned i; 3240 for (i = 0; i != NumLaneElts; ++i) { 3241 if (Mask[i+l] >= 0) 3242 break; 3243 } 3244 3245 // Lane is all undef, go to next lane 3246 if (i == NumLaneElts) 3247 continue; 3248 3249 int Start = Mask[i+l]; 3250 3251 // Make sure its in this lane in one of the sources 3252 if (!isUndefOrInRange(Start, l, l+NumLaneElts) && 3253 !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) 3254 return false; 3255 3256 // If not lane 0, then we must match lane 0 3257 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) 3258 return false; 3259 3260 // Correct second source to be contiguous with first source 3261 if (Start >= (int)NumElts) 3262 Start -= NumElts - NumLaneElts; 3263 3264 // Make sure we're shifting in the right direction. 3265 if (Start <= (int)(i+l)) 3266 return false; 3267 3268 Start -= i; 3269 3270 // Check the rest of the elements to see if they are consecutive. 
3271    for (++i; i != NumLaneElts; ++i) {
3272      int Idx = Mask[i+l];
3273
3274      // Make sure it's in this lane
3275      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
3276          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
3277        return false;
3278
3279      // If not lane 0, then we must match lane 0
3280      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
3281        return false;
3282
3283      if (Idx >= (int)NumElts)
3284        Idx -= NumElts - NumLaneElts;
3285
3286      if (!isUndefOrEqual(Idx, Start+i))
3287        return false;
3288
3289    }
3290  }
3291
3292  return true;
3293}
3294
3295/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3296/// the two vector operands have swapped position.
3297static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
3298                                     unsigned NumElems) {
3299  for (unsigned i = 0; i != NumElems; ++i) {
3300    int idx = Mask[i];
3301    if (idx < 0)
3302      continue;
3303    else if (idx < (int)NumElems)
3304      Mask[i] = idx + NumElems;
3305    else
3306      Mask[i] = idx - NumElems;
3307  }
3308}
3309
3310/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
3311/// specifies a shuffle of elements that is suitable for input to 128/256-bit
3312/// SHUFPS and SHUFPD. If Commuted is true, it checks whether the sources are
3313/// in the reverse order of what x86 shuffles want.
3314static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
3315                        bool Commuted = false) {
3316  if (!HasAVX && VT.getSizeInBits() == 256)
3317    return false;
3318
3319  unsigned NumElems = VT.getVectorNumElements();
3320  unsigned NumLanes = VT.getSizeInBits()/128;
3321  unsigned NumLaneElems = NumElems/NumLanes;
3322
3323  if (NumLaneElems != 2 && NumLaneElems != 4)
3324    return false;
3325
3326  // VSHUFPSY divides the resulting vector into 4 chunks.
3327  // The sources are also split into 4 chunks, and each destination
3328  // chunk must come from a different source chunk.
3329  //
3330  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
3331  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
3332  //
3333  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
3334  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
3335  //
3336  // VSHUFPDY divides the resulting vector into 4 chunks.
3337  // The sources are also split into 4 chunks, and each destination
3338  // chunk must come from a different source chunk.
3339  //
3340  //  SRC1 =>      X3       X2       X1       X0
3341  //  SRC2 =>      Y3       Y2       Y1       Y0
3342  //
3343  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
3344  //
3345  unsigned HalfLaneElems = NumLaneElems/2;
3346  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
3347    for (unsigned i = 0; i != NumLaneElems; ++i) {
3348      int Idx = Mask[i+l];
3349      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
3350      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
3351        return false;
3352      // For VSHUFPSY, the mask of the second half must be the same as the
3353      // first but with the appropriate offsets. This works in the same way as
3354      // VPERMILPS works with masks.
3355      if (NumElems != 8 || l == 0 || Mask[i] < 0)
3356        continue;
3357      if (!isUndefOrEqual(Idx, Mask[i]+l))
3358        return false;
3359    }
3360  }
3361
3362  return true;
3363}
3364
3365/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3366/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3367static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) { 3368 unsigned NumElems = VT.getVectorNumElements(); 3369 3370 if (VT.getSizeInBits() != 128) 3371 return false; 3372 3373 if (NumElems != 4) 3374 return false; 3375 3376 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3377 return isUndefOrEqual(Mask[0], 6) && 3378 isUndefOrEqual(Mask[1], 7) && 3379 isUndefOrEqual(Mask[2], 2) && 3380 isUndefOrEqual(Mask[3], 3); 3381} 3382 3383/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3384/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3385/// <2, 3, 2, 3> 3386static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) { 3387 unsigned NumElems = VT.getVectorNumElements(); 3388 3389 if (VT.getSizeInBits() != 128) 3390 return false; 3391 3392 if (NumElems != 4) 3393 return false; 3394 3395 return isUndefOrEqual(Mask[0], 2) && 3396 isUndefOrEqual(Mask[1], 3) && 3397 isUndefOrEqual(Mask[2], 2) && 3398 isUndefOrEqual(Mask[3], 3); 3399} 3400 3401/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3402/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3403static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { 3404 if (VT.getSizeInBits() != 128) 3405 return false; 3406 3407 unsigned NumElems = VT.getVectorNumElements(); 3408 3409 if (NumElems != 2 && NumElems != 4) 3410 return false; 3411 3412 for (unsigned i = 0; i != NumElems/2; ++i) 3413 if (!isUndefOrEqual(Mask[i], i + NumElems)) 3414 return false; 3415 3416 for (unsigned i = NumElems/2; i != NumElems; ++i) 3417 if (!isUndefOrEqual(Mask[i], i)) 3418 return false; 3419 3420 return true; 3421} 3422 3423/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3424/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3425static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { 3426 unsigned NumElems = VT.getVectorNumElements(); 3427 3428 if ((NumElems != 2 && NumElems != 4) 3429 || VT.getSizeInBits() > 128) 3430 return false; 3431 3432 for (unsigned i = 0; i != NumElems/2; ++i) 3433 if (!isUndefOrEqual(Mask[i], i)) 3434 return false; 3435 3436 for (unsigned i = 0; i != NumElems/2; ++i) 3437 if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems)) 3438 return false; 3439 3440 return true; 3441} 3442 3443/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3444/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3445static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, 3446 bool HasAVX2, bool V2IsSplat = false) { 3447 unsigned NumElts = VT.getVectorNumElements(); 3448 3449 assert((VT.is128BitVector() || VT.is256BitVector()) && 3450 "Unsupported vector type for unpckh"); 3451 3452 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3453 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3454 return false; 3455 3456 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3457 // independently on 128-bit lanes. 
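  // For example, the 128-bit v4i32 unpckl mask is <0, 4, 1, 5>; the 256-bit
  // v8i32 form is <0, 8, 1, 9, 4, 12, 5, 13>, i.e. the same pattern repeated
  // within each 128-bit lane.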
3458 unsigned NumLanes = VT.getSizeInBits()/128; 3459 unsigned NumLaneElts = NumElts/NumLanes; 3460 3461 for (unsigned l = 0; l != NumLanes; ++l) { 3462 for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; 3463 i != (l+1)*NumLaneElts; 3464 i += 2, ++j) { 3465 int BitI = Mask[i]; 3466 int BitI1 = Mask[i+1]; 3467 if (!isUndefOrEqual(BitI, j)) 3468 return false; 3469 if (V2IsSplat) { 3470 if (!isUndefOrEqual(BitI1, NumElts)) 3471 return false; 3472 } else { 3473 if (!isUndefOrEqual(BitI1, j + NumElts)) 3474 return false; 3475 } 3476 } 3477 } 3478 3479 return true; 3480} 3481 3482/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3483/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3484static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, 3485 bool HasAVX2, bool V2IsSplat = false) { 3486 unsigned NumElts = VT.getVectorNumElements(); 3487 3488 assert((VT.is128BitVector() || VT.is256BitVector()) && 3489 "Unsupported vector type for unpckh"); 3490 3491 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3492 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3493 return false; 3494 3495 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3496 // independently on 128-bit lanes. 3497 unsigned NumLanes = VT.getSizeInBits()/128; 3498 unsigned NumLaneElts = NumElts/NumLanes; 3499 3500 for (unsigned l = 0; l != NumLanes; ++l) { 3501 for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; 3502 i != (l+1)*NumLaneElts; i += 2, ++j) { 3503 int BitI = Mask[i]; 3504 int BitI1 = Mask[i+1]; 3505 if (!isUndefOrEqual(BitI, j)) 3506 return false; 3507 if (V2IsSplat) { 3508 if (isUndefOrEqual(BitI1, NumElts)) 3509 return false; 3510 } else { 3511 if (!isUndefOrEqual(BitI1, j+NumElts)) 3512 return false; 3513 } 3514 } 3515 } 3516 return true; 3517} 3518 3519/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3520/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3521/// <0, 0, 1, 1> 3522static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, 3523 bool HasAVX2) { 3524 unsigned NumElts = VT.getVectorNumElements(); 3525 3526 assert((VT.is128BitVector() || VT.is256BitVector()) && 3527 "Unsupported vector type for unpckh"); 3528 3529 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3530 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3531 return false; 3532 3533 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern 3534 // FIXME: Need a better way to get rid of this, there's no latency difference 3535 // between UNPCKLPD and MOVDDUP, the later should always be checked first and 3536 // the former later. We should also remove the "_undef" special mask. 3537 if (NumElts == 4 && VT.getSizeInBits() == 256) 3538 return false; 3539 3540 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3541 // independently on 128-bit lanes. 3542 unsigned NumLanes = VT.getSizeInBits()/128; 3543 unsigned NumLaneElts = NumElts/NumLanes; 3544 3545 for (unsigned l = 0; l != NumLanes; ++l) { 3546 for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; 3547 i != (l+1)*NumLaneElts; 3548 i += 2, ++j) { 3549 int BitI = Mask[i]; 3550 int BitI1 = Mask[i+1]; 3551 3552 if (!isUndefOrEqual(BitI, j)) 3553 return false; 3554 if (!isUndefOrEqual(BitI1, j)) 3555 return false; 3556 } 3557 } 3558 3559 return true; 3560} 3561 3562/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3563/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. 
vector_shuffle v, undef, 3564/// <2, 2, 3, 3> 3565static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { 3566 unsigned NumElts = VT.getVectorNumElements(); 3567 3568 assert((VT.is128BitVector() || VT.is256BitVector()) && 3569 "Unsupported vector type for unpckh"); 3570 3571 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3572 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3573 return false; 3574 3575 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3576 // independently on 128-bit lanes. 3577 unsigned NumLanes = VT.getSizeInBits()/128; 3578 unsigned NumLaneElts = NumElts/NumLanes; 3579 3580 for (unsigned l = 0; l != NumLanes; ++l) { 3581 for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; 3582 i != (l+1)*NumLaneElts; i += 2, ++j) { 3583 int BitI = Mask[i]; 3584 int BitI1 = Mask[i+1]; 3585 if (!isUndefOrEqual(BitI, j)) 3586 return false; 3587 if (!isUndefOrEqual(BitI1, j)) 3588 return false; 3589 } 3590 } 3591 return true; 3592} 3593 3594/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3595/// specifies a shuffle of elements that is suitable for input to MOVSS, 3596/// MOVSD, and MOVD, i.e. setting the lowest element. 3597static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { 3598 if (VT.getVectorElementType().getSizeInBits() < 32) 3599 return false; 3600 if (VT.getSizeInBits() == 256) 3601 return false; 3602 3603 unsigned NumElts = VT.getVectorNumElements(); 3604 3605 if (!isUndefOrEqual(Mask[0], NumElts)) 3606 return false; 3607 3608 for (unsigned i = 1; i != NumElts; ++i) 3609 if (!isUndefOrEqual(Mask[i], i)) 3610 return false; 3611 3612 return true; 3613} 3614 3615/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered 3616/// as permutations between 128-bit chunks or halves. As an example: this 3617/// shuffle bellow: 3618/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> 3619/// The first half comes from the second half of V1 and the second half from the 3620/// the second half of V2. 3621static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { 3622 if (!HasAVX || VT.getSizeInBits() != 256) 3623 return false; 3624 3625 // The shuffle result is divided into half A and half B. In total the two 3626 // sources have 4 halves, namely: C, D, E, F. The final values of A and 3627 // B must come from C, D, E or F. 3628 unsigned HalfSize = VT.getVectorNumElements()/2; 3629 bool MatchA = false, MatchB = false; 3630 3631 // Check if A comes from one of C, D, E, F. 3632 for (unsigned Half = 0; Half != 4; ++Half) { 3633 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { 3634 MatchA = true; 3635 break; 3636 } 3637 } 3638 3639 // Check if B comes from one of C, D, E, F. 3640 for (unsigned Half = 0; Half != 4; ++Half) { 3641 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { 3642 MatchB = true; 3643 break; 3644 } 3645 } 3646 3647 return MatchA && MatchB; 3648} 3649 3650/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle 3651/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. 
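/// For example, the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> selects the upper
/// half of V1 (half index 1) and the upper half of V2 (half index 3), giving
/// the immediate 1 | (3 << 4) == 0x31.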
3652static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { 3653 EVT VT = SVOp->getValueType(0); 3654 3655 unsigned HalfSize = VT.getVectorNumElements()/2; 3656 3657 unsigned FstHalf = 0, SndHalf = 0; 3658 for (unsigned i = 0; i < HalfSize; ++i) { 3659 if (SVOp->getMaskElt(i) > 0) { 3660 FstHalf = SVOp->getMaskElt(i)/HalfSize; 3661 break; 3662 } 3663 } 3664 for (unsigned i = HalfSize; i < HalfSize*2; ++i) { 3665 if (SVOp->getMaskElt(i) > 0) { 3666 SndHalf = SVOp->getMaskElt(i)/HalfSize; 3667 break; 3668 } 3669 } 3670 3671 return (FstHalf | (SndHalf << 4)); 3672} 3673 3674/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand 3675/// specifies a shuffle of elements that is suitable for input to VPERMILPD*. 3676/// Note that VPERMIL mask matching is different depending whether theunderlying 3677/// type is 32 or 64. In the VPERMILPS the high half of the mask should point 3678/// to the same elements of the low, but to the higher half of the source. 3679/// In VPERMILPD the two lanes could be shuffled independently of each other 3680/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. 3681static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { 3682 if (!HasAVX) 3683 return false; 3684 3685 unsigned NumElts = VT.getVectorNumElements(); 3686 // Only match 256-bit with 32/64-bit types 3687 if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8)) 3688 return false; 3689 3690 unsigned NumLanes = VT.getSizeInBits()/128; 3691 unsigned LaneSize = NumElts/NumLanes; 3692 for (unsigned l = 0; l != NumElts; l += LaneSize) { 3693 for (unsigned i = 0; i != LaneSize; ++i) { 3694 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) 3695 return false; 3696 if (NumElts != 8 || l == 0) 3697 continue; 3698 // VPERMILPS handling 3699 if (Mask[i] < 0) 3700 continue; 3701 if (!isUndefOrEqual(Mask[i+l], Mask[i]+l)) 3702 return false; 3703 } 3704 } 3705 3706 return true; 3707} 3708 3709/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse 3710/// of what x86 movss want. X86 movs requires the lowest element to be lowest 3711/// element of vector 2 and the other elements to come from vector 1 in order. 3712static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT, 3713 bool V2IsSplat = false, bool V2IsUndef = false) { 3714 unsigned NumOps = VT.getVectorNumElements(); 3715 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3716 return false; 3717 3718 if (!isUndefOrEqual(Mask[0], 0)) 3719 return false; 3720 3721 for (unsigned i = 1; i != NumOps; ++i) 3722 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3723 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3724 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3725 return false; 3726 3727 return true; 3728} 3729 3730/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3731/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 
3732/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 3733static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, 3734 const X86Subtarget *Subtarget) { 3735 if (!Subtarget->hasSSE3()) 3736 return false; 3737 3738 unsigned NumElems = VT.getVectorNumElements(); 3739 3740 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3741 (VT.getSizeInBits() == 256 && NumElems != 8)) 3742 return false; 3743 3744 // "i+1" is the value the indexed mask element must have 3745 for (unsigned i = 0; i != NumElems; i += 2) 3746 if (!isUndefOrEqual(Mask[i], i+1) || 3747 !isUndefOrEqual(Mask[i+1], i+1)) 3748 return false; 3749 3750 return true; 3751} 3752 3753/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3754/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3755/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 3756static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, 3757 const X86Subtarget *Subtarget) { 3758 if (!Subtarget->hasSSE3()) 3759 return false; 3760 3761 unsigned NumElems = VT.getVectorNumElements(); 3762 3763 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3764 (VT.getSizeInBits() == 256 && NumElems != 8)) 3765 return false; 3766 3767 // "i" is the value the indexed mask element must have 3768 for (unsigned i = 0; i != NumElems; i += 2) 3769 if (!isUndefOrEqual(Mask[i], i) || 3770 !isUndefOrEqual(Mask[i+1], i)) 3771 return false; 3772 3773 return true; 3774} 3775 3776/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand 3777/// specifies a shuffle of elements that is suitable for input to 256-bit 3778/// version of MOVDDUP. 3779static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { 3780 unsigned NumElts = VT.getVectorNumElements(); 3781 3782 if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4) 3783 return false; 3784 3785 for (unsigned i = 0; i != NumElts/2; ++i) 3786 if (!isUndefOrEqual(Mask[i], 0)) 3787 return false; 3788 for (unsigned i = NumElts/2; i != NumElts; ++i) 3789 if (!isUndefOrEqual(Mask[i], NumElts/2)) 3790 return false; 3791 return true; 3792} 3793 3794/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3795/// specifies a shuffle of elements that is suitable for input to 128-bit 3796/// version of MOVDDUP. 3797static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { 3798 if (VT.getSizeInBits() != 128) 3799 return false; 3800 3801 unsigned e = VT.getVectorNumElements() / 2; 3802 for (unsigned i = 0; i != e; ++i) 3803 if (!isUndefOrEqual(Mask[i], i)) 3804 return false; 3805 for (unsigned i = 0; i != e; ++i) 3806 if (!isUndefOrEqual(Mask[e+i], i)) 3807 return false; 3808 return true; 3809} 3810 3811/// isVEXTRACTF128Index - Return true if the specified 3812/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3813/// suitable for input to VEXTRACTF128. 3814bool X86::isVEXTRACTF128Index(SDNode *N) { 3815 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3816 return false; 3817 3818 // The index should be aligned on a 128-bit boundary. 3819 uint64_t Index = 3820 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3821 3822 unsigned VL = N->getValueType(0).getVectorNumElements(); 3823 unsigned VBits = N->getValueType(0).getSizeInBits(); 3824 unsigned ElSize = VBits / VL; 3825 bool Result = (Index * ElSize) % 128 == 0; 3826 3827 return Result; 3828} 3829 3830/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 3831/// operand specifies a subvector insert that is suitable for input to 3832/// VINSERTF128. 
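/// For example, inserting a 128-bit subvector into a v8f32 at element index 4
/// (bit offset 128) is suitable, whereas index 2 (bit offset 64) is not.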
3833bool X86::isVINSERTF128Index(SDNode *N) { 3834 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3835 return false; 3836 3837 // The index should be aligned on a 128-bit boundary. 3838 uint64_t Index = 3839 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3840 3841 unsigned VL = N->getValueType(0).getVectorNumElements(); 3842 unsigned VBits = N->getValueType(0).getSizeInBits(); 3843 unsigned ElSize = VBits / VL; 3844 bool Result = (Index * ElSize) % 128 == 0; 3845 3846 return Result; 3847} 3848 3849/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3850/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3851/// Handles 128-bit and 256-bit. 3852static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { 3853 EVT VT = N->getValueType(0); 3854 3855 assert((VT.is128BitVector() || VT.is256BitVector()) && 3856 "Unsupported vector type for PSHUF/SHUFP"); 3857 3858 // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate 3859 // independently on 128-bit lanes. 3860 unsigned NumElts = VT.getVectorNumElements(); 3861 unsigned NumLanes = VT.getSizeInBits()/128; 3862 unsigned NumLaneElts = NumElts/NumLanes; 3863 3864 assert((NumLaneElts == 2 || NumLaneElts == 4) && 3865 "Only supports 2 or 4 elements per lane"); 3866 3867 unsigned Shift = (NumLaneElts == 4) ? 1 : 0; 3868 unsigned Mask = 0; 3869 for (unsigned i = 0; i != NumElts; ++i) { 3870 int Elt = N->getMaskElt(i); 3871 if (Elt < 0) continue; 3872 Elt %= NumLaneElts; 3873 unsigned ShAmt = i << Shift; 3874 if (ShAmt >= 8) ShAmt -= 8; 3875 Mask |= Elt << ShAmt; 3876 } 3877 3878 return Mask; 3879} 3880 3881/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3882/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3883static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { 3884 unsigned Mask = 0; 3885 // 8 nodes, but we only care about the last 4. 3886 for (unsigned i = 7; i >= 4; --i) { 3887 int Val = N->getMaskElt(i); 3888 if (Val >= 0) 3889 Mask |= (Val - 4); 3890 if (i != 4) 3891 Mask <<= 2; 3892 } 3893 return Mask; 3894} 3895 3896/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3897/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3898static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { 3899 unsigned Mask = 0; 3900 // 8 nodes, but we only care about the first 4. 3901 for (int i = 3; i >= 0; --i) { 3902 int Val = N->getMaskElt(i); 3903 if (Val >= 0) 3904 Mask |= Val; 3905 if (i != 0) 3906 Mask <<= 2; 3907 } 3908 return Mask; 3909} 3910 3911/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3912/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 
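/// For example, a v16i8 shuffle whose first defined mask element is 5 at
/// position 0 produces the byte immediate (5 - 0) * 1 == 5.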
3913static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { 3914 EVT VT = SVOp->getValueType(0); 3915 unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; 3916 3917 unsigned NumElts = VT.getVectorNumElements(); 3918 unsigned NumLanes = VT.getSizeInBits()/128; 3919 unsigned NumLaneElts = NumElts/NumLanes; 3920 3921 int Val = 0; 3922 unsigned i; 3923 for (i = 0; i != NumElts; ++i) { 3924 Val = SVOp->getMaskElt(i); 3925 if (Val >= 0) 3926 break; 3927 } 3928 if (Val >= (int)NumElts) 3929 Val -= NumElts - NumLaneElts; 3930 3931 assert(Val - i > 0 && "PALIGNR imm should be positive"); 3932 return (Val - i) * EltSize; 3933} 3934 3935/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 3936/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 3937/// instructions. 3938unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 3939 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3940 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 3941 3942 uint64_t Index = 3943 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3944 3945 EVT VecVT = N->getOperand(0).getValueType(); 3946 EVT ElVT = VecVT.getVectorElementType(); 3947 3948 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3949 return Index / NumElemsPerChunk; 3950} 3951 3952/// getInsertVINSERTF128Immediate - Return the appropriate immediate 3953/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 3954/// instructions. 3955unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 3956 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3957 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 3958 3959 uint64_t Index = 3960 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3961 3962 EVT VecVT = N->getValueType(0); 3963 EVT ElVT = VecVT.getVectorElementType(); 3964 3965 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3966 return Index / NumElemsPerChunk; 3967} 3968 3969/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3970/// constant +0.0. 3971bool X86::isZeroNode(SDValue Elt) { 3972 return ((isa<ConstantSDNode>(Elt) && 3973 cast<ConstantSDNode>(Elt)->isNullValue()) || 3974 (isa<ConstantFPSDNode>(Elt) && 3975 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3976} 3977 3978/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3979/// their permute mask. 3980static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3981 SelectionDAG &DAG) { 3982 EVT VT = SVOp->getValueType(0); 3983 unsigned NumElems = VT.getVectorNumElements(); 3984 SmallVector<int, 8> MaskVec; 3985 3986 for (unsigned i = 0; i != NumElems; ++i) { 3987 int idx = SVOp->getMaskElt(i); 3988 if (idx < 0) 3989 MaskVec.push_back(idx); 3990 else if (idx < (int)NumElems) 3991 MaskVec.push_back(idx + NumElems); 3992 else 3993 MaskVec.push_back(idx - NumElems); 3994 } 3995 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3996 SVOp->getOperand(0), &MaskVec[0]); 3997} 3998 3999/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 4000/// match movhlps. The lower half elements should come from upper half of 4001/// V1 (and in order), and the upper half elements should come from the upper 4002/// half of V2 (and in order). 
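/// In mask form this is <2, 3, 6, 7> for a 4-element vector, with undef
/// allowed in any position.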
4003static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) { 4004 if (VT.getSizeInBits() != 128) 4005 return false; 4006 if (VT.getVectorNumElements() != 4) 4007 return false; 4008 for (unsigned i = 0, e = 2; i != e; ++i) 4009 if (!isUndefOrEqual(Mask[i], i+2)) 4010 return false; 4011 for (unsigned i = 2; i != 4; ++i) 4012 if (!isUndefOrEqual(Mask[i], i+4)) 4013 return false; 4014 return true; 4015} 4016 4017/// isScalarLoadToVector - Returns true if the node is a scalar load that 4018/// is promoted to a vector. It also returns the LoadSDNode by reference if 4019/// required. 4020static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 4021 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 4022 return false; 4023 N = N->getOperand(0).getNode(); 4024 if (!ISD::isNON_EXTLoad(N)) 4025 return false; 4026 if (LD) 4027 *LD = cast<LoadSDNode>(N); 4028 return true; 4029} 4030 4031// Test whether the given value is a vector value which will be legalized 4032// into a load. 4033static bool WillBeConstantPoolLoad(SDNode *N) { 4034 if (N->getOpcode() != ISD::BUILD_VECTOR) 4035 return false; 4036 4037 // Check for any non-constant elements. 4038 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) 4039 switch (N->getOperand(i).getNode()->getOpcode()) { 4040 case ISD::UNDEF: 4041 case ISD::ConstantFP: 4042 case ISD::Constant: 4043 break; 4044 default: 4045 return false; 4046 } 4047 4048 // Vectors of all-zeros and all-ones are materialized with special 4049 // instructions rather than being loaded. 4050 return !ISD::isBuildVectorAllZeros(N) && 4051 !ISD::isBuildVectorAllOnes(N); 4052} 4053 4054/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 4055/// match movlp{s|d}. The lower half elements should come from lower half of 4056/// V1 (and in order), and the upper half elements should come from the upper 4057/// half of V2 (and in order). And since V1 will become the source of the 4058/// MOVLP, it must be either a vector load or a scalar load to vector. 4059static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 4060 ArrayRef<int> Mask, EVT VT) { 4061 if (VT.getSizeInBits() != 128) 4062 return false; 4063 4064 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 4065 return false; 4066 // Is V2 is a vector load, don't do this transformation. We will try to use 4067 // load folding shufps op. 4068 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) 4069 return false; 4070 4071 unsigned NumElems = VT.getVectorNumElements(); 4072 4073 if (NumElems != 2 && NumElems != 4) 4074 return false; 4075 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 4076 if (!isUndefOrEqual(Mask[i], i)) 4077 return false; 4078 for (unsigned i = NumElems/2; i != NumElems; ++i) 4079 if (!isUndefOrEqual(Mask[i], i+NumElems)) 4080 return false; 4081 return true; 4082} 4083 4084/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 4085/// all the same. 4086static bool isSplatVector(SDNode *N) { 4087 if (N->getOpcode() != ISD::BUILD_VECTOR) 4088 return false; 4089 4090 SDValue SplatValue = N->getOperand(0); 4091 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 4092 if (N->getOperand(i) != SplatValue) 4093 return false; 4094 return true; 4095} 4096 4097/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 4098/// to an zero vector. 
4099/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 4100static bool isZeroShuffle(ShuffleVectorSDNode *N) { 4101 SDValue V1 = N->getOperand(0); 4102 SDValue V2 = N->getOperand(1); 4103 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 4104 for (unsigned i = 0; i != NumElems; ++i) { 4105 int Idx = N->getMaskElt(i); 4106 if (Idx >= (int)NumElems) { 4107 unsigned Opc = V2.getOpcode(); 4108 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 4109 continue; 4110 if (Opc != ISD::BUILD_VECTOR || 4111 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 4112 return false; 4113 } else if (Idx >= 0) { 4114 unsigned Opc = V1.getOpcode(); 4115 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 4116 continue; 4117 if (Opc != ISD::BUILD_VECTOR || 4118 !X86::isZeroNode(V1.getOperand(Idx))) 4119 return false; 4120 } 4121 } 4122 return true; 4123} 4124 4125/// getZeroVector - Returns a vector of specified type with all zero elements. 4126/// 4127static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, 4128 SelectionDAG &DAG, DebugLoc dl) { 4129 assert(VT.isVector() && "Expected a vector type"); 4130 4131 // Always build SSE zero vectors as <4 x i32> bitcasted 4132 // to their dest type. This ensures they get CSE'd. 4133 SDValue Vec; 4134 if (VT.getSizeInBits() == 128) { // SSE 4135 if (Subtarget->hasSSE2()) { // SSE2 4136 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4137 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4138 } else { // SSE1 4139 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4140 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 4141 } 4142 } else if (VT.getSizeInBits() == 256) { // AVX 4143 if (Subtarget->hasAVX2()) { // AVX2 4144 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4145 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4146 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); 4147 } else { 4148 // 256-bit logic and arithmetic instructions in AVX are all 4149 // floating-point, no support for integer ops. Emit fp zeroed vectors. 4150 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4151 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4152 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 4153 } 4154 } 4155 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4156} 4157 4158/// getOnesVector - Returns a vector of specified type with all bits set. 4159/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with 4160/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. 4161/// Then bitcast to their original type, ensuring they get CSE'd. 
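/// For example, without AVX2 a v4i64 all-ones value is built as a v4i32 of
/// ~0U inserted into both 128-bit halves of an undef v8i32, then bitcast
/// back to v4i64.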
4162static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, 4163 DebugLoc dl) { 4164 assert(VT.isVector() && "Expected a vector type"); 4165 assert((VT.is128BitVector() || VT.is256BitVector()) 4166 && "Expected a 128-bit or 256-bit vector type"); 4167 4168 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 4169 SDValue Vec; 4170 if (VT.getSizeInBits() == 256) { 4171 if (HasAVX2) { // AVX2 4172 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4173 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); 4174 } else { // AVX 4175 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4176 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), 4177 Vec, DAG.getConstant(0, MVT::i32), DAG, dl); 4178 Vec = Insert128BitVector(InsV, Vec, 4179 DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); 4180 } 4181 } else { 4182 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4183 } 4184 4185 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4186} 4187 4188/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 4189/// that point to V2 points to its first element. 4190static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) { 4191 for (unsigned i = 0; i != NumElems; ++i) { 4192 if (Mask[i] > (int)NumElems) { 4193 Mask[i] = NumElems; 4194 } 4195 } 4196} 4197 4198/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 4199/// operation of specified width. 4200static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4201 SDValue V2) { 4202 unsigned NumElems = VT.getVectorNumElements(); 4203 SmallVector<int, 8> Mask; 4204 Mask.push_back(NumElems); 4205 for (unsigned i = 1; i != NumElems; ++i) 4206 Mask.push_back(i); 4207 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4208} 4209 4210/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 4211static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4212 SDValue V2) { 4213 unsigned NumElems = VT.getVectorNumElements(); 4214 SmallVector<int, 8> Mask; 4215 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 4216 Mask.push_back(i); 4217 Mask.push_back(i + NumElems); 4218 } 4219 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4220} 4221 4222/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 4223static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4224 SDValue V2) { 4225 unsigned NumElems = VT.getVectorNumElements(); 4226 unsigned Half = NumElems/2; 4227 SmallVector<int, 8> Mask; 4228 for (unsigned i = 0; i != Half; ++i) { 4229 Mask.push_back(i + Half); 4230 Mask.push_back(i + NumElems + Half); 4231 } 4232 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4233} 4234 4235// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by 4236// a generic shuffle instruction because the target has no such instructions. 4237// Generate shuffles which repeat i16 and i8 several times until they can be 4238// represented by v4f32 and then be manipulated by target suported shuffles. 
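// For example, to splat element 5 of a v8i16, one unpckh of the vector with
// itself produces <4,4,5,5,6,6,7,7>; the desired value now fills 32 contiguous
// bits, so the splat can finish as element 1 of the v4f32 view of the vector.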
4239static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { 4240 EVT VT = V.getValueType(); 4241 int NumElems = VT.getVectorNumElements(); 4242 DebugLoc dl = V.getDebugLoc(); 4243 4244 while (NumElems > 4) { 4245 if (EltNo < NumElems/2) { 4246 V = getUnpackl(DAG, dl, VT, V, V); 4247 } else { 4248 V = getUnpackh(DAG, dl, VT, V, V); 4249 EltNo -= NumElems/2; 4250 } 4251 NumElems >>= 1; 4252 } 4253 return V; 4254} 4255 4256/// getLegalSplat - Generate a legal splat with supported x86 shuffles 4257static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { 4258 EVT VT = V.getValueType(); 4259 DebugLoc dl = V.getDebugLoc(); 4260 assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) 4261 && "Vector size not supported"); 4262 4263 if (VT.getSizeInBits() == 128) { 4264 V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); 4265 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 4266 V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), 4267 &SplatMask[0]); 4268 } else { 4269 // To use VPERMILPS to splat scalars, the second half of indicies must 4270 // refer to the higher part, which is a duplication of the lower one, 4271 // because VPERMILPS can only handle in-lane permutations. 4272 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, 4273 EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; 4274 4275 V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); 4276 V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), 4277 &SplatMask[0]); 4278 } 4279 4280 return DAG.getNode(ISD::BITCAST, dl, VT, V); 4281} 4282 4283/// PromoteSplat - Splat is promoted to target supported vector shuffles. 4284static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 4285 EVT SrcVT = SV->getValueType(0); 4286 SDValue V1 = SV->getOperand(0); 4287 DebugLoc dl = SV->getDebugLoc(); 4288 4289 int EltNo = SV->getSplatIndex(); 4290 int NumElems = SrcVT.getVectorNumElements(); 4291 unsigned Size = SrcVT.getSizeInBits(); 4292 4293 assert(((Size == 128 && NumElems > 4) || Size == 256) && 4294 "Unknown how to promote splat for type"); 4295 4296 // Extract the 128-bit part containing the splat element and update 4297 // the splat element index when it refers to the higher register. 4298 if (Size == 256) { 4299 unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0; 4300 V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); 4301 if (Idx > 0) 4302 EltNo -= NumElems/2; 4303 } 4304 4305 // All i16 and i8 vector types can't be used directly by a generic shuffle 4306 // instruction because the target has no such instruction. Generate shuffles 4307 // which repeat i16 and i8 several times until they fit in i32, and then can 4308 // be manipulated by target suported shuffles. 4309 EVT EltVT = SrcVT.getVectorElementType(); 4310 if (EltVT == MVT::i8 || EltVT == MVT::i16) 4311 V1 = PromoteSplati8i16(V1, DAG, EltNo); 4312 4313 // Recreate the 256-bit vector and place the same 128-bit vector 4314 // into the low and high part. This is necessary because we want 4315 // to use VPERM* to shuffle the vectors 4316 if (Size == 256) { 4317 SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1, 4318 DAG.getConstant(0, MVT::i32), DAG, dl); 4319 V1 = Insert128BitVector(InsV, V1, 4320 DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 4321 } 4322 4323 return getLegalSplat(DAG, V1, EltNo); 4324} 4325 4326/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4327/// vector of zero or undef vector. 
This produces a shuffle where the low 4328/// element of V2 is swizzled into the zero/undef vector, landing at element 4329/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 4330static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4331 bool IsZero, 4332 const X86Subtarget *Subtarget, 4333 SelectionDAG &DAG) { 4334 EVT VT = V2.getValueType(); 4335 SDValue V1 = IsZero 4336 ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 4337 unsigned NumElems = VT.getVectorNumElements(); 4338 SmallVector<int, 16> MaskVec; 4339 for (unsigned i = 0; i != NumElems; ++i) 4340 // If this is the insertion idx, put the low elt of V2 here. 4341 MaskVec.push_back(i == Idx ? NumElems : i); 4342 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 4343} 4344 4345/// getShuffleScalarElt - Returns the scalar element that will make up the ith 4346/// element of the result of the vector shuffle. 4347static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 4348 unsigned Depth) { 4349 if (Depth == 6) 4350 return SDValue(); // Limit search depth. 4351 4352 SDValue V = SDValue(N, 0); 4353 EVT VT = V.getValueType(); 4354 unsigned Opcode = V.getOpcode(); 4355 4356 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 4357 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 4358 Index = SV->getMaskElt(Index); 4359 4360 if (Index < 0) 4361 return DAG.getUNDEF(VT.getVectorElementType()); 4362 4363 unsigned NumElems = VT.getVectorNumElements(); 4364 SDValue NewV = (Index < (int)NumElems) ? SV->getOperand(0) 4365 : SV->getOperand(1); 4366 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 4367 } 4368 4369 // Recurse into target specific vector shuffles to find scalars. 4370 if (isTargetShuffle(Opcode)) { 4371 unsigned NumElems = VT.getVectorNumElements(); 4372 SmallVector<unsigned, 16> ShuffleMask; 4373 SDValue ImmN; 4374 4375 switch(Opcode) { 4376 case X86ISD::SHUFP: 4377 ImmN = N->getOperand(N->getNumOperands()-1); 4378 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4379 ShuffleMask); 4380 break; 4381 case X86ISD::UNPCKH: 4382 DecodeUNPCKHMask(VT, ShuffleMask); 4383 break; 4384 case X86ISD::UNPCKL: 4385 DecodeUNPCKLMask(VT, ShuffleMask); 4386 break; 4387 case X86ISD::MOVHLPS: 4388 DecodeMOVHLPSMask(NumElems, ShuffleMask); 4389 break; 4390 case X86ISD::MOVLHPS: 4391 DecodeMOVLHPSMask(NumElems, ShuffleMask); 4392 break; 4393 case X86ISD::PSHUFD: 4394 case X86ISD::VPERMILP: 4395 ImmN = N->getOperand(N->getNumOperands()-1); 4396 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4397 ShuffleMask); 4398 break; 4399 case X86ISD::PSHUFHW: 4400 ImmN = N->getOperand(N->getNumOperands()-1); 4401 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 4402 ShuffleMask); 4403 break; 4404 case X86ISD::PSHUFLW: 4405 ImmN = N->getOperand(N->getNumOperands()-1); 4406 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 4407 ShuffleMask); 4408 break; 4409 case X86ISD::MOVSS: 4410 case X86ISD::MOVSD: { 4411 // The index 0 always comes from the first element of the second source, 4412 // this is why MOVSS and MOVSD are used in the first place. The other 4413 // elements come from the other positions of the first source vector. 4414 unsigned OpNum = (Index == 0) ? 
1 : 0;
4415      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
4416                                 Depth+1);
4417    }
4418    case X86ISD::VPERM2X128:
4419      ImmN = N->getOperand(N->getNumOperands()-1);
4420      DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
4421                           ShuffleMask);
4422      break;
4423    case X86ISD::MOVDDUP:
4424    case X86ISD::MOVLHPD:
4425    case X86ISD::MOVLPD:
4426    case X86ISD::MOVLPS:
4427    case X86ISD::MOVSHDUP:
4428    case X86ISD::MOVSLDUP:
4429    case X86ISD::PALIGN:
4430      return SDValue(); // Not yet implemented.
4431    default: llvm_unreachable("unknown target shuffle node");
4432    }
4433
4434    Index = ShuffleMask[Index];
4435    if (Index < 0)
4436      return DAG.getUNDEF(VT.getVectorElementType());
4437
4438    SDValue NewV = (Index < (int)NumElems) ? N->getOperand(0)
4439                                           : N->getOperand(1);
4440    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
4441                               Depth+1);
4442  }
4443
4444  // Actual nodes that may contain scalar elements
4445  if (Opcode == ISD::BITCAST) {
4446    V = V.getOperand(0);
4447    EVT SrcVT = V.getValueType();
4448    unsigned NumElems = VT.getVectorNumElements();
4449
4450    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
4451      return SDValue();
4452  }
4453
4454  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
4455    return (Index == 0) ? V.getOperand(0)
4456                        : DAG.getUNDEF(VT.getVectorElementType());
4457
4458  if (V.getOpcode() == ISD::BUILD_VECTOR)
4459    return V.getOperand(Index);
4460
4461  return SDValue();
4462}
4463
4464/// getNumOfConsecutiveZeros - Return the number of elements of a vector
4465/// shuffle operation that consecutively come from a zero (or undef) value.
4466/// The search can start in two different directions, from left or right.
4467static
4468unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
4469                                  bool ZerosFromLeft, SelectionDAG &DAG) {
4470  int i = 0;
4471
4472  while (i < NumElems) {
4473    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
4474    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
4475    if (!(Elt.getNode() &&
4476         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
4477      break;
4478    ++i;
4479  }
4480
4481  return i;
4482}
4483
4484/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
4485/// MaskE correspond consecutively to elements from one of the vector operands,
4486/// starting from its index OpIdx. Also set OpNum to the source vector operand
4487/// they come from.
4488static
4489bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
4490                              int OpIdx, int NumElems, unsigned &OpNum) {
4491  bool SeenV1 = false;
4492  bool SeenV2 = false;
4493
4494  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
4495    int Idx = SVOp->getMaskElt(i);
4496    // Ignore undef indices
4497    if (Idx < 0)
4498      continue;
4499
4500    if (Idx < NumElems)
4501      SeenV1 = true;
4502    else
4503      SeenV2 = true;
4504
4505    // Only accept consecutive elements from the same vector
4506    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
4507      return false;
4508  }
4509
4510  OpNum = SeenV1 ? 0 : 1;
4511  return true;
4512}

4513/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
4514/// logical right shift of a vector.
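/// For example, a v4i32 shuffle producing <V[1], V[2], V[3], zero> is a
/// logical right shift of V by one element (ShAmt == 1).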
4515static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4516 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4517 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4518 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4519 false /* check zeros from right */, DAG); 4520 unsigned OpSrc; 4521 4522 if (!NumZeros) 4523 return false; 4524 4525 // Considering the elements in the mask that are not consecutive zeros, 4526 // check if they consecutively come from only one of the source vectors. 4527 // 4528 // V1 = {X, A, B, C} 0 4529 // \ \ \ / 4530 // vector_shuffle V1, V2 <1, 2, 3, X> 4531 // 4532 if (!isShuffleMaskConsecutive(SVOp, 4533 0, // Mask Start Index 4534 NumElems-NumZeros-1, // Mask End Index 4535 NumZeros, // Where to start looking in the src vector 4536 NumElems, // Number of elements in vector 4537 OpSrc)) // Which source operand ? 4538 return false; 4539 4540 isLeft = false; 4541 ShAmt = NumZeros; 4542 ShVal = SVOp->getOperand(OpSrc); 4543 return true; 4544} 4545 4546/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 4547/// logical left shift of a vector. 4548static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4549 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4550 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4551 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4552 true /* check zeros from left */, DAG); 4553 unsigned OpSrc; 4554 4555 if (!NumZeros) 4556 return false; 4557 4558 // Considering the elements in the mask that are not consecutive zeros, 4559 // check if they consecutively come from only one of the source vectors. 4560 // 4561 // 0 { A, B, X, X } = V2 4562 // / \ / / 4563 // vector_shuffle V1, V2 <X, X, 4, 5> 4564 // 4565 if (!isShuffleMaskConsecutive(SVOp, 4566 NumZeros, // Mask Start Index 4567 NumElems-1, // Mask End Index 4568 0, // Where to start looking in the src vector 4569 NumElems, // Number of elements in vector 4570 OpSrc)) // Which source operand ? 4571 return false; 4572 4573 isLeft = true; 4574 ShAmt = NumZeros; 4575 ShVal = SVOp->getOperand(OpSrc); 4576 return true; 4577} 4578 4579/// isVectorShift - Returns true if the shuffle can be implemented as a 4580/// logical left or right shift of a vector. 4581static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4582 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4583 // Although the logic below support any bitwidth size, there are no 4584 // shift instructions which handle more than 128-bit vectors. 4585 if (SVOp->getValueType(0).getSizeInBits() > 128) 4586 return false; 4587 4588 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4589 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4590 return true; 4591 4592 return false; 4593} 4594 4595/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
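/// (A sketch of the approach below: each set bit of NonZeros marks a non-zero
/// byte of the input. Adjacent byte pairs are zero-extended, shifted, and OR'd
/// into an i16 and inserted into a v8i16, so at most eight pinsrw-style
/// inserts are generated; the v8i16 result is bitcast back to v16i8.)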
4596/// 4597static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4598 unsigned NumNonZero, unsigned NumZero, 4599 SelectionDAG &DAG, 4600 const X86Subtarget* Subtarget, 4601 const TargetLowering &TLI) { 4602 if (NumNonZero > 8) 4603 return SDValue(); 4604 4605 DebugLoc dl = Op.getDebugLoc(); 4606 SDValue V(0, 0); 4607 bool First = true; 4608 for (unsigned i = 0; i < 16; ++i) { 4609 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4610 if (ThisIsNonZero && First) { 4611 if (NumZero) 4612 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 4613 else 4614 V = DAG.getUNDEF(MVT::v8i16); 4615 First = false; 4616 } 4617 4618 if ((i & 1) != 0) { 4619 SDValue ThisElt(0, 0), LastElt(0, 0); 4620 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4621 if (LastIsNonZero) { 4622 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4623 MVT::i16, Op.getOperand(i-1)); 4624 } 4625 if (ThisIsNonZero) { 4626 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4627 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4628 ThisElt, DAG.getConstant(8, MVT::i8)); 4629 if (LastIsNonZero) 4630 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4631 } else 4632 ThisElt = LastElt; 4633 4634 if (ThisElt.getNode()) 4635 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4636 DAG.getIntPtrConstant(i/2)); 4637 } 4638 } 4639 4640 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4641} 4642 4643/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 4644/// 4645static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4646 unsigned NumNonZero, unsigned NumZero, 4647 SelectionDAG &DAG, 4648 const X86Subtarget* Subtarget, 4649 const TargetLowering &TLI) { 4650 if (NumNonZero > 4) 4651 return SDValue(); 4652 4653 DebugLoc dl = Op.getDebugLoc(); 4654 SDValue V(0, 0); 4655 bool First = true; 4656 for (unsigned i = 0; i < 8; ++i) { 4657 bool isNonZero = (NonZeros & (1 << i)) != 0; 4658 if (isNonZero) { 4659 if (First) { 4660 if (NumZero) 4661 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 4662 else 4663 V = DAG.getUNDEF(MVT::v8i16); 4664 First = false; 4665 } 4666 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4667 MVT::v8i16, V, Op.getOperand(i), 4668 DAG.getIntPtrConstant(i)); 4669 } 4670 } 4671 4672 return V; 4673} 4674 4675/// getVShift - Return a vector logical shift node. 4676/// 4677static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4678 unsigned NumBits, SelectionDAG &DAG, 4679 const TargetLowering &TLI, DebugLoc dl) { 4680 assert(VT.getSizeInBits() == 128 && "Unknown type for VShift"); 4681 EVT ShVT = MVT::v2i64; 4682 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; 4683 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4684 return DAG.getNode(ISD::BITCAST, dl, VT, 4685 DAG.getNode(Opc, dl, ShVT, SrcOp, 4686 DAG.getConstant(NumBits, 4687 TLI.getShiftAmountTy(SrcOp.getValueType())))); 4688} 4689 4690SDValue 4691X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4692 SelectionDAG &DAG) const { 4693 4694 // Check if the scalar load can be widened into a vector load. And if 4695 // the address is "base + cst" see if the cst can be "absorbed" into 4696 // the shuffle mask. 
4697 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4698 SDValue Ptr = LD->getBasePtr(); 4699 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4700 return SDValue(); 4701 EVT PVT = LD->getValueType(0); 4702 if (PVT != MVT::i32 && PVT != MVT::f32) 4703 return SDValue(); 4704 4705 int FI = -1; 4706 int64_t Offset = 0; 4707 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4708 FI = FINode->getIndex(); 4709 Offset = 0; 4710 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4711 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4712 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4713 Offset = Ptr.getConstantOperandVal(1); 4714 Ptr = Ptr.getOperand(0); 4715 } else { 4716 return SDValue(); 4717 } 4718 4719 // FIXME: 256-bit vector instructions don't require a strict alignment, 4720 // improve this code to support it better. 4721 unsigned RequiredAlign = VT.getSizeInBits()/8; 4722 SDValue Chain = LD->getChain(); 4723 // Make sure the stack object alignment is at least 16 or 32. 4724 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4725 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 4726 if (MFI->isFixedObjectIndex(FI)) { 4727 // Can't change the alignment. FIXME: It's possible to compute 4728 // the exact stack offset and reference FI + adjust offset instead. 4729 // If someone *really* cares about this. That's the way to implement it. 4730 return SDValue(); 4731 } else { 4732 MFI->setObjectAlignment(FI, RequiredAlign); 4733 } 4734 } 4735 4736 // (Offset % 16 or 32) must be multiple of 4. Then address is then 4737 // Ptr + (Offset & ~15). 4738 if (Offset < 0) 4739 return SDValue(); 4740 if ((Offset % RequiredAlign) & 3) 4741 return SDValue(); 4742 int64_t StartOffset = Offset & ~(RequiredAlign-1); 4743 if (StartOffset) 4744 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4745 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4746 4747 int EltNo = (Offset - StartOffset) >> 2; 4748 int NumElems = VT.getVectorNumElements(); 4749 4750 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 4751 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 4752 LD->getPointerInfo().getWithOffset(StartOffset), 4753 false, false, false, 0); 4754 4755 SmallVector<int, 8> Mask; 4756 for (int i = 0; i < NumElems; ++i) 4757 Mask.push_back(EltNo); 4758 4759 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); 4760 } 4761 4762 return SDValue(); 4763} 4764 4765/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4766/// vector of type 'VT', see if the elements can be replaced by a single large 4767/// load which has the same value as a build_vector whose operands are 'elts'. 4768/// 4769/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4770/// 4771/// FIXME: we'd also like to handle the case where the last elements are zero 4772/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4773/// There's even a handy isZeroNode for that purpose. 4774static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4775 DebugLoc &DL, SelectionDAG &DAG) { 4776 EVT EltVT = VT.getVectorElementType(); 4777 unsigned NumElems = Elts.size(); 4778 4779 LoadSDNode *LDBase = NULL; 4780 unsigned LastLoadedElt = -1U; 4781 4782 // For each element in the initializer, see if we've found a load or an undef. 4783 // If we don't find an initial load element, or later load elements are 4784 // non-consecutive, bail out. 
4785 for (unsigned i = 0; i < NumElems; ++i) { 4786 SDValue Elt = Elts[i]; 4787 4788 if (!Elt.getNode() || 4789 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4790 return SDValue(); 4791 if (!LDBase) { 4792 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4793 return SDValue(); 4794 LDBase = cast<LoadSDNode>(Elt.getNode()); 4795 LastLoadedElt = i; 4796 continue; 4797 } 4798 if (Elt.getOpcode() == ISD::UNDEF) 4799 continue; 4800 4801 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4802 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4803 return SDValue(); 4804 LastLoadedElt = i; 4805 } 4806 4807 // If we have found an entire vector of loads and undefs, then return a large 4808 // load of the entire vector width starting at the base pointer. If we found 4809 // consecutive loads for the low half, generate a vzext_load node. 4810 if (LastLoadedElt == NumElems - 1) { 4811 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4812 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4813 LDBase->getPointerInfo(), 4814 LDBase->isVolatile(), LDBase->isNonTemporal(), 4815 LDBase->isInvariant(), 0); 4816 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4817 LDBase->getPointerInfo(), 4818 LDBase->isVolatile(), LDBase->isNonTemporal(), 4819 LDBase->isInvariant(), LDBase->getAlignment()); 4820 } else if (NumElems == 4 && LastLoadedElt == 1 && 4821 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 4822 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4823 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4824 SDValue ResNode = 4825 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64, 4826 LDBase->getPointerInfo(), 4827 LDBase->getAlignment(), 4828 false/*isVolatile*/, true/*ReadMem*/, 4829 false/*WriteMem*/); 4830 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4831 } 4832 return SDValue(); 4833} 4834 4835/// isVectorBroadcast - Check if the node chain is suitable to be xformed to 4836/// a vbroadcast node. We support two patterns: 4837/// 1. A splat BUILD_VECTOR which uses a single scalar load. 4838/// 2. A splat shuffle which uses a scalar_to_vector node which comes from 4839/// a scalar load. 4840/// The scalar load node is returned when a pattern is found, 4841/// or SDValue() otherwise. 4842static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) { 4843 if (!Subtarget->hasAVX()) 4844 return SDValue(); 4845 4846 EVT VT = Op.getValueType(); 4847 SDValue V = Op; 4848 4849 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 4850 V = V.getOperand(0); 4851 4852 //A suspected load to be broadcasted. 4853 SDValue Ld; 4854 4855 switch (V.getOpcode()) { 4856 default: 4857 // Unknown pattern found. 4858 return SDValue(); 4859 4860 case ISD::BUILD_VECTOR: { 4861 // The BUILD_VECTOR node must be a splat. 4862 if (!isSplatVector(V.getNode())) 4863 return SDValue(); 4864 4865 Ld = V.getOperand(0); 4866 4867 // The suspected load node has several users. Make sure that all 4868 // of its users are from the BUILD_VECTOR node. 4869 if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) 4870 return SDValue(); 4871 break; 4872 } 4873 4874 case ISD::VECTOR_SHUFFLE: { 4875 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4876 4877 // Shuffles must have a splat mask where the first element is 4878 // broadcasted. 
4879 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 4880 return SDValue(); 4881 4882 SDValue Sc = Op.getOperand(0); 4883 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR) 4884 return SDValue(); 4885 4886 Ld = Sc.getOperand(0); 4887 4888 // The scalar_to_vector node and the suspected 4889 // load node must have exactly one user. 4890 if (!Sc.hasOneUse() || !Ld.hasOneUse()) 4891 return SDValue(); 4892 break; 4893 } 4894 } 4895 4896 // The scalar source must be a normal load. 4897 if (!ISD::isNormalLoad(Ld.getNode())) 4898 return SDValue(); 4899 4900 // Reject loads that have uses of the chain result 4901 if (Ld->hasAnyUseOfValue(1)) 4902 return SDValue(); 4903 4904 bool Is256 = VT.getSizeInBits() == 256; 4905 bool Is128 = VT.getSizeInBits() == 128; 4906 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 4907 4908 // VBroadcast to YMM 4909 if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) 4910 return Ld; 4911 4912 // VBroadcast to XMM 4913 if (Is128 && (ScalarSize == 32)) 4914 return Ld; 4915 4916 // The integer check is needed for the 64-bit into 128-bit so it doesn't match 4917 // double since there is vbroadcastsd xmm 4918 if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { 4919 // VBroadcast to YMM 4920 if (Is256 && (ScalarSize == 8 || ScalarSize == 16)) 4921 return Ld; 4922 4923 // VBroadcast to XMM 4924 if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) 4925 return Ld; 4926 } 4927 4928 // Unsupported broadcast. 4929 return SDValue(); 4930} 4931 4932SDValue 4933X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 4934 DebugLoc dl = Op.getDebugLoc(); 4935 4936 EVT VT = Op.getValueType(); 4937 EVT ExtVT = VT.getVectorElementType(); 4938 unsigned NumElems = Op.getNumOperands(); 4939 4940 // Vectors containing all zeros can be matched by pxor and xorps later 4941 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 4942 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 4943 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 4944 if (VT == MVT::v4i32 || VT == MVT::v8i32) 4945 return Op; 4946 4947 return getZeroVector(VT, Subtarget, DAG, dl); 4948 } 4949 4950 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 4951 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use 4952 // vpcmpeqd on 256-bit vectors. 4953 if (ISD::isBuildVectorAllOnes(Op.getNode())) { 4954 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2())) 4955 return Op; 4956 4957 return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl); 4958 } 4959 4960 SDValue LD = isVectorBroadcast(Op, Subtarget); 4961 if (LD.getNode()) 4962 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); 4963 4964 unsigned EVTBits = ExtVT.getSizeInBits(); 4965 4966 unsigned NumZero = 0; 4967 unsigned NumNonZero = 0; 4968 unsigned NonZeros = 0; 4969 bool IsAllConstants = true; 4970 SmallSet<SDValue, 8> Values; 4971 for (unsigned i = 0; i < NumElems; ++i) { 4972 SDValue Elt = Op.getOperand(i); 4973 if (Elt.getOpcode() == ISD::UNDEF) 4974 continue; 4975 Values.insert(Elt); 4976 if (Elt.getOpcode() != ISD::Constant && 4977 Elt.getOpcode() != ISD::ConstantFP) 4978 IsAllConstants = false; 4979 if (X86::isZeroNode(Elt)) 4980 NumZero++; 4981 else { 4982 NonZeros |= (1 << i); 4983 NumNonZero++; 4984 } 4985 } 4986 4987 // All undef vector. Return an UNDEF. All zero vectors were handled above. 4988 if (NumNonZero == 0) 4989 return DAG.getUNDEF(VT); 4990 4991 // Special case for single non-zero, non-undef, element. 
4992 if (NumNonZero == 1) { 4993 unsigned Idx = CountTrailingZeros_32(NonZeros); 4994 SDValue Item = Op.getOperand(Idx); 4995 4996 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4997 // the value are obviously zero, truncate the value to i32 and do the 4998 // insertion that way. Only do this if the value is non-constant or if the 4999 // value is a constant being inserted into element 0. It is cheaper to do 5000 // a constant pool load than it is to do a movd + shuffle. 5001 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 5002 (!IsAllConstants || Idx == 0)) { 5003 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 5004 // Handle SSE only. 5005 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 5006 EVT VecVT = MVT::v4i32; 5007 unsigned VecElts = 4; 5008 5009 // Truncate the value (which may itself be a constant) to i32, and 5010 // convert it to a vector with movd (S2V+shuffle to zero extend). 5011 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 5012 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 5013 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5014 5015 // Now we have our 32-bit value zero extended in the low element of 5016 // a vector. If Idx != 0, swizzle it into place. 5017 if (Idx != 0) { 5018 SmallVector<int, 4> Mask; 5019 Mask.push_back(Idx); 5020 for (unsigned i = 1; i != VecElts; ++i) 5021 Mask.push_back(i); 5022 Item = DAG.getVectorShuffle(VecVT, dl, Item, 5023 DAG.getUNDEF(Item.getValueType()), 5024 &Mask[0]); 5025 } 5026 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5027 } 5028 } 5029 5030 // If we have a constant or non-constant insertion into the low element of 5031 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 5032 // the rest of the elements. This will be matched as movd/movq/movss/movsd 5033 // depending on what the source datatype is. 5034 if (Idx == 0) { 5035 if (NumZero == 0) 5036 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5037 5038 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 5039 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 5040 if (VT.getSizeInBits() == 256) { 5041 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); 5042 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, 5043 Item, DAG.getIntPtrConstant(0)); 5044 } 5045 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 5046 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5047 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 5048 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5049 } 5050 5051 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 5052 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 5053 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 5054 if (VT.getSizeInBits() == 256) { 5055 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); 5056 Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32), 5057 DAG, dl); 5058 } else { 5059 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 5060 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 5061 } 5062 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5063 } 5064 } 5065 5066 // Is it a vector logical left shift? 
5067 if (NumElems == 2 && Idx == 1 && 5068 X86::isZeroNode(Op.getOperand(0)) && 5069 !X86::isZeroNode(Op.getOperand(1))) { 5070 unsigned NumBits = VT.getSizeInBits(); 5071 return getVShift(true, VT, 5072 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5073 VT, Op.getOperand(1)), 5074 NumBits/2, DAG, *this, dl); 5075 } 5076 5077 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 5078 return SDValue(); 5079 5080 // Otherwise, if this is a vector with i32 or f32 elements, and the element 5081 // is a non-constant being inserted into an element other than the low one, 5082 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 5083 // movd/movss) to move this into the low element, then shuffle it into 5084 // place. 5085 if (EVTBits == 32) { 5086 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5087 5088 // Turn it into a shuffle of zero and zero-extended scalar to vector. 5089 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); 5090 SmallVector<int, 8> MaskVec; 5091 for (unsigned i = 0; i < NumElems; i++) 5092 MaskVec.push_back(i == Idx ? 0 : 1); 5093 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 5094 } 5095 } 5096 5097 // Splat is obviously ok. Let legalizer expand it to a shuffle. 5098 if (Values.size() == 1) { 5099 if (EVTBits == 32) { 5100 // Instead of a shuffle like this: 5101 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 5102 // Check if it's possible to issue this instead. 5103 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 5104 unsigned Idx = CountTrailingZeros_32(NonZeros); 5105 SDValue Item = Op.getOperand(Idx); 5106 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 5107 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 5108 } 5109 return SDValue(); 5110 } 5111 5112 // A vector full of immediates; various special cases are already 5113 // handled, so this is best done with a single constant-pool load. 5114 if (IsAllConstants) 5115 return SDValue(); 5116 5117 // For AVX-length vectors, build the individual 128-bit pieces and use 5118 // shuffles to put them in place. 5119 if (VT.getSizeInBits() == 256) { 5120 SmallVector<SDValue, 32> V; 5121 for (unsigned i = 0; i != NumElems; ++i) 5122 V.push_back(Op.getOperand(i)); 5123 5124 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 5125 5126 // Build both the lower and upper subvector. 5127 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 5128 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 5129 NumElems/2); 5130 5131 // Recreate the wider vector with the lower and upper part. 5132 SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower, 5133 DAG.getConstant(0, MVT::i32), DAG, dl); 5134 return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32), 5135 DAG, dl); 5136 } 5137 5138 // Let legalizer expand 2-wide build_vectors. 5139 if (EVTBits == 64) { 5140 if (NumNonZero == 1) { 5141 // One half is zero or undef. 5142 unsigned Idx = CountTrailingZeros_32(NonZeros); 5143 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 5144 Op.getOperand(Idx)); 5145 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); 5146 } 5147 return SDValue(); 5148 } 5149 5150 // If element VT is < 32 bits, convert it to inserts into a zero vector. 
5151 if (EVTBits == 8 && NumElems == 16) { 5152 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 5153 Subtarget, *this); 5154 if (V.getNode()) return V; 5155 } 5156 5157 if (EVTBits == 16 && NumElems == 8) { 5158 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 5159 Subtarget, *this); 5160 if (V.getNode()) return V; 5161 } 5162 5163 // If element VT is == 32 bits, turn it into a number of shuffles. 5164 SmallVector<SDValue, 8> V(NumElems); 5165 if (NumElems == 4 && NumZero > 0) { 5166 for (unsigned i = 0; i < 4; ++i) { 5167 bool isZero = !(NonZeros & (1 << i)); 5168 if (isZero) 5169 V[i] = getZeroVector(VT, Subtarget, DAG, dl); 5170 else 5171 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5172 } 5173 5174 for (unsigned i = 0; i < 2; ++i) { 5175 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 5176 default: break; 5177 case 0: 5178 V[i] = V[i*2]; // Must be a zero vector. 5179 break; 5180 case 1: 5181 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 5182 break; 5183 case 2: 5184 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 5185 break; 5186 case 3: 5187 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 5188 break; 5189 } 5190 } 5191 5192 bool Reverse1 = (NonZeros & 0x3) == 2; 5193 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; 5194 int MaskVec[] = { 5195 Reverse1 ? 1 : 0, 5196 Reverse1 ? 0 : 1, 5197 static_cast<int>(Reverse2 ? NumElems+1 : NumElems), 5198 static_cast<int>(Reverse2 ? NumElems : NumElems+1) 5199 }; 5200 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 5201 } 5202 5203 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 5204 // Check for a build vector of consecutive loads. 5205 for (unsigned i = 0; i < NumElems; ++i) 5206 V[i] = Op.getOperand(i); 5207 5208 // Check for elements which are consecutive loads. 5209 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 5210 if (LD.getNode()) 5211 return LD; 5212 5213 // For SSE 4.1, use insertps to put the high elements into the low element. 5214 if (getSubtarget()->hasSSE41()) { 5215 SDValue Result; 5216 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 5217 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 5218 else 5219 Result = DAG.getUNDEF(VT); 5220 5221 for (unsigned i = 1; i < NumElems; ++i) { 5222 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 5223 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 5224 Op.getOperand(i), DAG.getIntPtrConstant(i)); 5225 } 5226 return Result; 5227 } 5228 5229 // Otherwise, expand into a number of unpckl*, start by extending each of 5230 // our (non-undef) elements to the full vector width with the element in the 5231 // bottom slot of the vector (which generates no code for SSE). 5232 for (unsigned i = 0; i < NumElems; ++i) { 5233 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 5234 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5235 else 5236 V[i] = DAG.getUNDEF(VT); 5237 } 5238 5239 // Next, we iteratively mix elements, e.g. 
for v4f32: 5240 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 5241 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 5242 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 5243 unsigned EltStride = NumElems >> 1; 5244 while (EltStride != 0) { 5245 for (unsigned i = 0; i < EltStride; ++i) { 5246 // If V[i+EltStride] is undef and this is the first round of mixing, 5247 // then it is safe to just drop this shuffle: V[i] is already in the 5248 // right place, the one element (since it's the first round) being 5249 // inserted as undef can be dropped. This isn't safe for successive 5250 // rounds because they will permute elements within both vectors. 5251 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 5252 EltStride == NumElems/2) 5253 continue; 5254 5255 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 5256 } 5257 EltStride >>= 1; 5258 } 5259 return V[0]; 5260 } 5261 return SDValue(); 5262} 5263 5264// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place 5265// them in a MMX register. This is better than doing a stack convert. 5266static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5267 DebugLoc dl = Op.getDebugLoc(); 5268 EVT ResVT = Op.getValueType(); 5269 5270 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 5271 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 5272 int Mask[2]; 5273 SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); 5274 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 5275 InVec = Op.getOperand(1); 5276 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 5277 unsigned NumElts = ResVT.getVectorNumElements(); 5278 VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 5279 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 5280 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 5281 } else { 5282 InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); 5283 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 5284 Mask[0] = 0; Mask[1] = 2; 5285 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 5286 } 5287 return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 5288} 5289 5290// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 5291// to create 256-bit vectors from two other 128-bit ones. 5292static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5293 DebugLoc dl = Op.getDebugLoc(); 5294 EVT ResVT = Op.getValueType(); 5295 5296 assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide"); 5297 5298 SDValue V1 = Op.getOperand(0); 5299 SDValue V2 = Op.getOperand(1); 5300 unsigned NumElems = ResVT.getVectorNumElements(); 5301 5302 SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1, 5303 DAG.getConstant(0, MVT::i32), DAG, dl); 5304 return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), 5305 DAG, dl); 5306} 5307 5308SDValue 5309X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 5310 EVT ResVT = Op.getValueType(); 5311 5312 assert(Op.getNumOperands() == 2); 5313 assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) && 5314 "Unsupported CONCAT_VECTORS for value type"); 5315 5316 // We support concatenate two MMX registers and place them in a MMX register. 5317 // This is better than doing a stack convert. 5318 if (ResVT.is128BitVector()) 5319 return LowerMMXCONCAT_VECTORS(Op, DAG); 5320 5321 // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors 5322 // from two other 128-bit ones. 
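  // E.g. (v8f32 concat_vectors A, B) becomes an INSERT_SUBVECTOR of A into the
  // low 128-bit half of an undef v8f32 followed by one of B into the high
  // half, which the AVX patterns match as vinsertf128.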
5323 return LowerAVXCONCAT_VECTORS(Op, DAG); 5324} 5325 5326// v8i16 shuffles - Prefer shuffles in the following order: 5327// 1. [all] pshuflw, pshufhw, optional move 5328// 2. [ssse3] 1 x pshufb 5329// 3. [ssse3] 2 x pshufb + 1 x por 5330// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 5331SDValue 5332X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 5333 SelectionDAG &DAG) const { 5334 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5335 SDValue V1 = SVOp->getOperand(0); 5336 SDValue V2 = SVOp->getOperand(1); 5337 DebugLoc dl = SVOp->getDebugLoc(); 5338 SmallVector<int, 8> MaskVals; 5339 5340 // Determine if more than 1 of the words in each of the low and high quadwords 5341 // of the result come from the same quadword of one of the two inputs. Undef 5342 // mask values count as coming from any quadword, for better codegen. 5343 unsigned LoQuad[] = { 0, 0, 0, 0 }; 5344 unsigned HiQuad[] = { 0, 0, 0, 0 }; 5345 std::bitset<4> InputQuads; 5346 for (unsigned i = 0; i < 8; ++i) { 5347 unsigned *Quad = i < 4 ? LoQuad : HiQuad; 5348 int EltIdx = SVOp->getMaskElt(i); 5349 MaskVals.push_back(EltIdx); 5350 if (EltIdx < 0) { 5351 ++Quad[0]; 5352 ++Quad[1]; 5353 ++Quad[2]; 5354 ++Quad[3]; 5355 continue; 5356 } 5357 ++Quad[EltIdx / 4]; 5358 InputQuads.set(EltIdx / 4); 5359 } 5360 5361 int BestLoQuad = -1; 5362 unsigned MaxQuad = 1; 5363 for (unsigned i = 0; i < 4; ++i) { 5364 if (LoQuad[i] > MaxQuad) { 5365 BestLoQuad = i; 5366 MaxQuad = LoQuad[i]; 5367 } 5368 } 5369 5370 int BestHiQuad = -1; 5371 MaxQuad = 1; 5372 for (unsigned i = 0; i < 4; ++i) { 5373 if (HiQuad[i] > MaxQuad) { 5374 BestHiQuad = i; 5375 MaxQuad = HiQuad[i]; 5376 } 5377 } 5378 5379 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 5380 // of the two input vectors, shuffle them into one input vector so only a 5381 // single pshufb instruction is necessary. If There are more than 2 input 5382 // quads, disable the next transformation since it does not help SSSE3. 5383 bool V1Used = InputQuads[0] || InputQuads[1]; 5384 bool V2Used = InputQuads[2] || InputQuads[3]; 5385 if (Subtarget->hasSSSE3()) { 5386 if (InputQuads.count() == 2 && V1Used && V2Used) { 5387 BestLoQuad = InputQuads[0] ? 0 : 1; 5388 BestHiQuad = InputQuads[2] ? 2 : 3; 5389 } 5390 if (InputQuads.count() > 2) { 5391 BestLoQuad = -1; 5392 BestHiQuad = -1; 5393 } 5394 } 5395 5396 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 5397 // the shuffle mask. If a quad is scored as -1, that means that it contains 5398 // words from all 4 input quadwords. 5399 SDValue NewV; 5400 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 5401 int MaskV[] = { 5402 BestLoQuad < 0 ? 0 : BestLoQuad, 5403 BestHiQuad < 0 ? 1 : BestHiQuad 5404 }; 5405 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 5406 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 5407 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 5408 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 5409 5410 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 5411 // source words for the shuffle, to aid later transformations. 
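    // E.g. with BestLoQuad == 2 and BestHiQuad == 1, NewV now holds <quad 2,
    // quad 1> of the concatenated inputs; if every mask word falls into those
    // two quads (checked next), a word that pointed into quad 2 is remapped to
    // (idx & 3) and one that pointed into quad 1 to (idx & 3) + 4.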
5412 bool AllWordsInNewV = true; 5413 bool InOrder[2] = { true, true }; 5414 for (unsigned i = 0; i != 8; ++i) { 5415 int idx = MaskVals[i]; 5416 if (idx != (int)i) 5417 InOrder[i/4] = false; 5418 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 5419 continue; 5420 AllWordsInNewV = false; 5421 break; 5422 } 5423 5424 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 5425 if (AllWordsInNewV) { 5426 for (int i = 0; i != 8; ++i) { 5427 int idx = MaskVals[i]; 5428 if (idx < 0) 5429 continue; 5430 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 5431 if ((idx != i) && idx < 4) 5432 pshufhw = false; 5433 if ((idx != i) && idx > 3) 5434 pshuflw = false; 5435 } 5436 V1 = NewV; 5437 V2Used = false; 5438 BestLoQuad = 0; 5439 BestHiQuad = 1; 5440 } 5441 5442 // If we've eliminated the use of V2, and the new mask is a pshuflw or 5443 // pshufhw, that's as cheap as it gets. Return the new shuffle. 5444 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 5445 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 5446 unsigned TargetMask = 0; 5447 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 5448 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 5449 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5450 TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): 5451 getShufflePSHUFLWImmediate(SVOp); 5452 V1 = NewV.getOperand(0); 5453 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 5454 } 5455 } 5456 5457 // If we have SSSE3, and all words of the result are from 1 input vector, 5458 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 5459 // is present, fall back to case 4. 5460 if (Subtarget->hasSSSE3()) { 5461 SmallVector<SDValue,16> pshufbMask; 5462 5463 // If we have elements from both input vectors, set the high bit of the 5464 // shuffle mask element to zero out elements that come from V2 in the V1 5465 // mask, and elements that come from V1 in the V2 mask, so that the two 5466 // results can be OR'd together. 5467 bool TwoInputs = V1Used && V2Used; 5468 for (unsigned i = 0; i != 8; ++i) { 5469 int EltIdx = MaskVals[i] * 2; 5470 if (TwoInputs && (EltIdx >= 16)) { 5471 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5472 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5473 continue; 5474 } 5475 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5476 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 5477 } 5478 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 5479 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5480 DAG.getNode(ISD::BUILD_VECTOR, dl, 5481 MVT::v16i8, &pshufbMask[0], 16)); 5482 if (!TwoInputs) 5483 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5484 5485 // Calculate the shuffle mask for the second input, shuffle it, and 5486 // OR it with the first shuffled input. 
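    // E.g. if result word i comes from word w of V2 (MaskVals[i] == 8 + w),
    // the second pshufb mask gets bytes 2*w and 2*w+1 at position i, while the
    // first mask already holds 0x80 (zero) there, so the OR below combines the
    // two halves.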
5487 pshufbMask.clear(); 5488 for (unsigned i = 0; i != 8; ++i) { 5489 int EltIdx = MaskVals[i] * 2; 5490 if (EltIdx < 16) { 5491 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5492 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5493 continue; 5494 } 5495 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5496 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 5497 } 5498 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 5499 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5500 DAG.getNode(ISD::BUILD_VECTOR, dl, 5501 MVT::v16i8, &pshufbMask[0], 16)); 5502 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5503 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5504 } 5505 5506 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 5507 // and update MaskVals with new element order. 5508 std::bitset<8> InOrder; 5509 if (BestLoQuad >= 0) { 5510 int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; 5511 for (int i = 0; i != 4; ++i) { 5512 int idx = MaskVals[i]; 5513 if (idx < 0) { 5514 InOrder.set(i); 5515 } else if ((idx / 4) == BestLoQuad) { 5516 MaskV[i] = idx & 3; 5517 InOrder.set(i); 5518 } 5519 } 5520 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5521 &MaskV[0]); 5522 5523 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 5524 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5525 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 5526 NewV.getOperand(0), 5527 getShufflePSHUFLWImmediate(SVOp), DAG); 5528 } 5529 } 5530 5531 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 5532 // and update MaskVals with the new element order. 5533 if (BestHiQuad >= 0) { 5534 int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; 5535 for (unsigned i = 4; i != 8; ++i) { 5536 int idx = MaskVals[i]; 5537 if (idx < 0) { 5538 InOrder.set(i); 5539 } else if ((idx / 4) == BestHiQuad) { 5540 MaskV[i] = (idx & 3) + 4; 5541 InOrder.set(i); 5542 } 5543 } 5544 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5545 &MaskV[0]); 5546 5547 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { 5548 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 5549 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 5550 NewV.getOperand(0), 5551 getShufflePSHUFHWImmediate(SVOp), DAG); 5552 } 5553 } 5554 5555 // In case BestHi & BestLo were both -1, which means each quadword has a word 5556 // from each of the four input quadwords, calculate the InOrder bitvector now 5557 // before falling through to the insert/extract cleanup. 5558 if (BestLoQuad == -1 && BestHiQuad == -1) { 5559 NewV = V1; 5560 for (int i = 0; i != 8; ++i) 5561 if (MaskVals[i] < 0 || MaskVals[i] == i) 5562 InOrder.set(i); 5563 } 5564 5565 // The other elements are put in the right place using pextrw and pinsrw. 5566 for (unsigned i = 0; i != 8; ++i) { 5567 if (InOrder[i]) 5568 continue; 5569 int EltIdx = MaskVals[i]; 5570 if (EltIdx < 0) 5571 continue; 5572 SDValue ExtOp = (EltIdx < 8) 5573 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 5574 DAG.getIntPtrConstant(EltIdx)) 5575 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 5576 DAG.getIntPtrConstant(EltIdx - 8)); 5577 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 5578 DAG.getIntPtrConstant(i)); 5579 } 5580 return NewV; 5581} 5582 5583// v16i8 shuffles - Prefer shuffles in the following order: 5584// 1. [ssse3] 1 x pshufb 5585// 2. 
[ssse3] 2 x pshufb + 1 x por 5586// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 5587static 5588SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 5589 SelectionDAG &DAG, 5590 const X86TargetLowering &TLI) { 5591 SDValue V1 = SVOp->getOperand(0); 5592 SDValue V2 = SVOp->getOperand(1); 5593 DebugLoc dl = SVOp->getDebugLoc(); 5594 ArrayRef<int> MaskVals = SVOp->getMask(); 5595 5596 // If we have SSSE3, case 1 is generated when all result bytes come from 5597 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 5598 // present, fall back to case 3. 5599 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 5600 bool V1Only = true; 5601 bool V2Only = true; 5602 for (unsigned i = 0; i < 16; ++i) { 5603 int EltIdx = MaskVals[i]; 5604 if (EltIdx < 0) 5605 continue; 5606 if (EltIdx < 16) 5607 V2Only = false; 5608 else 5609 V1Only = false; 5610 } 5611 5612 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 5613 if (TLI.getSubtarget()->hasSSSE3()) { 5614 SmallVector<SDValue,16> pshufbMask; 5615 5616 // If all result elements are from one input vector, then only translate 5617 // undef mask values to 0x80 (zero out result) in the pshufb mask. 5618 // 5619 // Otherwise, we have elements from both input vectors, and must zero out 5620 // elements that come from V2 in the first mask, and V1 in the second mask 5621 // so that we can OR them together. 5622 bool TwoInputs = !(V1Only || V2Only); 5623 for (unsigned i = 0; i != 16; ++i) { 5624 int EltIdx = MaskVals[i]; 5625 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 5626 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5627 continue; 5628 } 5629 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5630 } 5631 // If all the elements are from V2, assign it to V1 and return after 5632 // building the first pshufb. 5633 if (V2Only) 5634 V1 = V2; 5635 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5636 DAG.getNode(ISD::BUILD_VECTOR, dl, 5637 MVT::v16i8, &pshufbMask[0], 16)); 5638 if (!TwoInputs) 5639 return V1; 5640 5641 // Calculate the shuffle mask for the second input, shuffle it, and 5642 // OR it with the first shuffled input. 5643 pshufbMask.clear(); 5644 for (unsigned i = 0; i != 16; ++i) { 5645 int EltIdx = MaskVals[i]; 5646 if (EltIdx < 16) { 5647 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5648 continue; 5649 } 5650 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5651 } 5652 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5653 DAG.getNode(ISD::BUILD_VECTOR, dl, 5654 MVT::v16i8, &pshufbMask[0], 16)); 5655 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5656 } 5657 5658 // No SSSE3 - Calculate in place words and then fix all out of place words 5659 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 5660 // the 16 different words that comprise the two doublequadword input vectors. 5661 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5662 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 5663 SDValue NewV = V2Only ? V2 : V1; 5664 for (int i = 0; i != 8; ++i) { 5665 int Elt0 = MaskVals[i*2]; 5666 int Elt1 = MaskVals[i*2+1]; 5667 5668 // This word of the result is all undef, skip it. 5669 if (Elt0 < 0 && Elt1 < 0) 5670 continue; 5671 5672 // This word of the result is already in the correct place, skip it. 5673 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 5674 continue; 5675 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 5676 continue; 5677 5678 SDValue Elt0Src = Elt0 < 16 ? 
V1 : V2; 5679 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 5680 SDValue InsElt; 5681 5682 // If Elt0 and Elt1 are defined, are consecutive, and can be load 5683 // using a single extract together, load it and store it. 5684 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 5685 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5686 DAG.getIntPtrConstant(Elt1 / 2)); 5687 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5688 DAG.getIntPtrConstant(i)); 5689 continue; 5690 } 5691 5692 // If Elt1 is defined, extract it from the appropriate source. If the 5693 // source byte is not also odd, shift the extracted word left 8 bits 5694 // otherwise clear the bottom 8 bits if we need to do an or. 5695 if (Elt1 >= 0) { 5696 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5697 DAG.getIntPtrConstant(Elt1 / 2)); 5698 if ((Elt1 & 1) == 0) 5699 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 5700 DAG.getConstant(8, 5701 TLI.getShiftAmountTy(InsElt.getValueType()))); 5702 else if (Elt0 >= 0) 5703 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 5704 DAG.getConstant(0xFF00, MVT::i16)); 5705 } 5706 // If Elt0 is defined, extract it from the appropriate source. If the 5707 // source byte is not also even, shift the extracted word right 8 bits. If 5708 // Elt1 was also defined, OR the extracted values together before 5709 // inserting them in the result. 5710 if (Elt0 >= 0) { 5711 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 5712 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 5713 if ((Elt0 & 1) != 0) 5714 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 5715 DAG.getConstant(8, 5716 TLI.getShiftAmountTy(InsElt0.getValueType()))); 5717 else if (Elt1 >= 0) 5718 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 5719 DAG.getConstant(0x00FF, MVT::i16)); 5720 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 5721 : InsElt0; 5722 } 5723 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5724 DAG.getIntPtrConstant(i)); 5725 } 5726 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 5727} 5728 5729/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 5730/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 5731/// done when every pair / quad of shuffle mask elements point to elements in 5732/// the right sequence. e.g. 5733/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 5734static 5735SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 5736 SelectionDAG &DAG, DebugLoc dl) { 5737 EVT VT = SVOp->getValueType(0); 5738 SDValue V1 = SVOp->getOperand(0); 5739 SDValue V2 = SVOp->getOperand(1); 5740 unsigned NumElems = VT.getVectorNumElements(); 5741 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 5742 EVT NewVT; 5743 switch (VT.getSimpleVT().SimpleTy) { 5744 default: llvm_unreachable("Unexpected!"); 5745 case MVT::v4f32: NewVT = MVT::v2f64; break; 5746 case MVT::v4i32: NewVT = MVT::v2i64; break; 5747 case MVT::v8i16: NewVT = MVT::v4i32; break; 5748 case MVT::v16i8: NewVT = MVT::v4i32; break; 5749 } 5750 5751 int Scale = NumElems / NewWidth; 5752 SmallVector<int, 8> MaskVec; 5753 for (unsigned i = 0; i < NumElems; i += Scale) { 5754 int StartIdx = -1; 5755 for (int j = 0; j < Scale; ++j) { 5756 int EltIdx = SVOp->getMaskElt(i+j); 5757 if (EltIdx < 0) 5758 continue; 5759 if (StartIdx == -1) 5760 StartIdx = EltIdx - (EltIdx % Scale); 5761 if (EltIdx != StartIdx + j) 5762 return SDValue(); 5763 } 5764 if (StartIdx == -1) 5765 MaskVec.push_back(-1); 5766 else 5767 MaskVec.push_back(StartIdx / Scale); 5768 } 5769 5770 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 5771 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 5772 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 5773} 5774 5775/// getVZextMovL - Return a zero-extending vector move low node. 5776/// 5777static SDValue getVZextMovL(EVT VT, EVT OpVT, 5778 SDValue SrcOp, SelectionDAG &DAG, 5779 const X86Subtarget *Subtarget, DebugLoc dl) { 5780 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 5781 LoadSDNode *LD = NULL; 5782 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 5783 LD = dyn_cast<LoadSDNode>(SrcOp); 5784 if (!LD) { 5785 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 5786 // instead. 5787 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 5788 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 5789 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 5790 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 5791 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 5792 // PR2108 5793 OpVT = (OpVT == MVT::v2f64) ? 
MVT::v2i64 : MVT::v4i32; 5794 return DAG.getNode(ISD::BITCAST, dl, VT, 5795 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5796 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5797 OpVT, 5798 SrcOp.getOperand(0) 5799 .getOperand(0)))); 5800 } 5801 } 5802 } 5803 5804 return DAG.getNode(ISD::BITCAST, dl, VT, 5805 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5806 DAG.getNode(ISD::BITCAST, dl, 5807 OpVT, SrcOp))); 5808} 5809 5810/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles 5811/// which could not be matched by any known target speficic shuffle 5812static SDValue 5813LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5814 EVT VT = SVOp->getValueType(0); 5815 5816 unsigned NumElems = VT.getVectorNumElements(); 5817 unsigned NumLaneElems = NumElems / 2; 5818 5819 int MinRange[2][2] = { { static_cast<int>(NumElems), 5820 static_cast<int>(NumElems) }, 5821 { static_cast<int>(NumElems), 5822 static_cast<int>(NumElems) } }; 5823 int MaxRange[2][2] = { { -1, -1 }, { -1, -1 } }; 5824 5825 // Collect used ranges for each source in each lane 5826 for (unsigned l = 0; l < 2; ++l) { 5827 unsigned LaneStart = l*NumLaneElems; 5828 for (unsigned i = 0; i != NumLaneElems; ++i) { 5829 int Idx = SVOp->getMaskElt(i+LaneStart); 5830 if (Idx < 0) 5831 continue; 5832 5833 int Input = 0; 5834 if (Idx >= (int)NumElems) { 5835 Idx -= NumElems; 5836 Input = 1; 5837 } 5838 5839 if (Idx > MaxRange[l][Input]) 5840 MaxRange[l][Input] = Idx; 5841 if (Idx < MinRange[l][Input]) 5842 MinRange[l][Input] = Idx; 5843 } 5844 } 5845 5846 // Make sure each range is 128-bits 5847 int ExtractIdx[2][2] = { { -1, -1 }, { -1, -1 } }; 5848 for (unsigned l = 0; l < 2; ++l) { 5849 for (unsigned Input = 0; Input < 2; ++Input) { 5850 if (MinRange[l][Input] == (int)NumElems && MaxRange[l][Input] < 0) 5851 continue; 5852 5853 if (MinRange[l][Input] >= 0 && MaxRange[l][Input] < (int)NumLaneElems) 5854 ExtractIdx[l][Input] = 0; 5855 else if (MinRange[l][Input] >= (int)NumLaneElems && 5856 MaxRange[l][Input] < (int)NumElems) 5857 ExtractIdx[l][Input] = NumLaneElems; 5858 else 5859 return SDValue(); 5860 } 5861 } 5862 5863 DebugLoc dl = SVOp->getDebugLoc(); 5864 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 5865 EVT NVT = MVT::getVectorVT(EltVT, NumElems/2); 5866 5867 SDValue Ops[2][2]; 5868 for (unsigned l = 0; l < 2; ++l) { 5869 for (unsigned Input = 0; Input < 2; ++Input) { 5870 if (ExtractIdx[l][Input] >= 0) 5871 Ops[l][Input] = Extract128BitVector(SVOp->getOperand(Input), 5872 DAG.getConstant(ExtractIdx[l][Input], MVT::i32), 5873 DAG, dl); 5874 else 5875 Ops[l][Input] = DAG.getUNDEF(NVT); 5876 } 5877 } 5878 5879 // Generate 128-bit shuffles 5880 SmallVector<int, 16> Mask1, Mask2; 5881 for (unsigned i = 0; i != NumLaneElems; ++i) { 5882 int Elt = SVOp->getMaskElt(i); 5883 if (Elt >= (int)NumElems) { 5884 Elt %= NumLaneElems; 5885 Elt += NumLaneElems; 5886 } else if (Elt >= 0) { 5887 Elt %= NumLaneElems; 5888 } 5889 Mask1.push_back(Elt); 5890 } 5891 for (unsigned i = NumLaneElems; i != NumElems; ++i) { 5892 int Elt = SVOp->getMaskElt(i); 5893 if (Elt >= (int)NumElems) { 5894 Elt %= NumLaneElems; 5895 Elt += NumLaneElems; 5896 } else if (Elt >= 0) { 5897 Elt %= NumLaneElems; 5898 } 5899 Mask2.push_back(Elt); 5900 } 5901 5902 SDValue Shuf1 = DAG.getVectorShuffle(NVT, dl, Ops[0][0], Ops[0][1], &Mask1[0]); 5903 SDValue Shuf2 = DAG.getVectorShuffle(NVT, dl, Ops[1][0], Ops[1][1], &Mask2[0]); 5904 5905 // Concatenate the result back 5906 SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shuf1, 5907 
DAG.getConstant(0, MVT::i32), DAG, dl); 5908 return Insert128BitVector(V, Shuf2, DAG.getConstant(NumElems/2, MVT::i32), 5909 DAG, dl); 5910} 5911 5912/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with 5913/// 4 elements, and match them with several different shuffle types. 5914static SDValue 5915LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5916 SDValue V1 = SVOp->getOperand(0); 5917 SDValue V2 = SVOp->getOperand(1); 5918 DebugLoc dl = SVOp->getDebugLoc(); 5919 EVT VT = SVOp->getValueType(0); 5920 5921 assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); 5922 5923 std::pair<int, int> Locs[4]; 5924 int Mask1[] = { -1, -1, -1, -1 }; 5925 SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); 5926 5927 unsigned NumHi = 0; 5928 unsigned NumLo = 0; 5929 for (unsigned i = 0; i != 4; ++i) { 5930 int Idx = PermMask[i]; 5931 if (Idx < 0) { 5932 Locs[i] = std::make_pair(-1, -1); 5933 } else { 5934 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 5935 if (Idx < 4) { 5936 Locs[i] = std::make_pair(0, NumLo); 5937 Mask1[NumLo] = Idx; 5938 NumLo++; 5939 } else { 5940 Locs[i] = std::make_pair(1, NumHi); 5941 if (2+NumHi < 4) 5942 Mask1[2+NumHi] = Idx; 5943 NumHi++; 5944 } 5945 } 5946 } 5947 5948 if (NumLo <= 2 && NumHi <= 2) { 5949 // If no more than two elements come from either vector. This can be 5950 // implemented with two shuffles. First shuffle gather the elements. 5951 // The second shuffle, which takes the first shuffle as both of its 5952 // vector operands, put the elements into the right order. 5953 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5954 5955 int Mask2[] = { -1, -1, -1, -1 }; 5956 5957 for (unsigned i = 0; i != 4; ++i) 5958 if (Locs[i].first != -1) { 5959 unsigned Idx = (i < 2) ? 0 : 4; 5960 Idx += Locs[i].first * 2 + Locs[i].second; 5961 Mask2[i] = Idx; 5962 } 5963 5964 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 5965 } else if (NumLo == 3 || NumHi == 3) { 5966 // Otherwise, we must have three elements from one vector, call it X, and 5967 // one element from the other, call it Y. First, use a shufps to build an 5968 // intermediate vector with the one element from Y and the element from X 5969 // that will be in the same half in the final destination (the indexes don't 5970 // matter). Then, use a shufps to build the final vector, taking the half 5971 // containing the element from Y from the intermediate, and the other half 5972 // from X. 5973 if (NumHi == 3) { 5974 // Normalize it so the 3 elements come from V1. 5975 CommuteVectorShuffleMask(PermMask, 4); 5976 std::swap(V1, V2); 5977 } 5978 5979 // Find the element from V2. 5980 unsigned HiIndex; 5981 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 5982 int Val = PermMask[HiIndex]; 5983 if (Val < 0) 5984 continue; 5985 if (Val >= 4) 5986 break; 5987 } 5988 5989 Mask1[0] = PermMask[HiIndex]; 5990 Mask1[1] = -1; 5991 Mask1[2] = PermMask[HiIndex^1]; 5992 Mask1[3] = -1; 5993 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5994 5995 if (HiIndex >= 2) { 5996 Mask1[0] = PermMask[0]; 5997 Mask1[1] = PermMask[1]; 5998 Mask1[2] = HiIndex & 1 ? 6 : 4; 5999 Mask1[3] = HiIndex & 1 ? 4 : 6; 6000 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6001 } else { 6002 Mask1[0] = HiIndex & 1 ? 2 : 0; 6003 Mask1[1] = HiIndex & 1 ? 
0 : 2; 6004 Mask1[2] = PermMask[2]; 6005 Mask1[3] = PermMask[3]; 6006 if (Mask1[2] >= 0) 6007 Mask1[2] += 4; 6008 if (Mask1[3] >= 0) 6009 Mask1[3] += 4; 6010 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 6011 } 6012 } 6013 6014 // Break it into (shuffle shuffle_hi, shuffle_lo). 6015 int LoMask[] = { -1, -1, -1, -1 }; 6016 int HiMask[] = { -1, -1, -1, -1 }; 6017 6018 int *MaskPtr = LoMask; 6019 unsigned MaskIdx = 0; 6020 unsigned LoIdx = 0; 6021 unsigned HiIdx = 2; 6022 for (unsigned i = 0; i != 4; ++i) { 6023 if (i == 2) { 6024 MaskPtr = HiMask; 6025 MaskIdx = 1; 6026 LoIdx = 0; 6027 HiIdx = 2; 6028 } 6029 int Idx = PermMask[i]; 6030 if (Idx < 0) { 6031 Locs[i] = std::make_pair(-1, -1); 6032 } else if (Idx < 4) { 6033 Locs[i] = std::make_pair(MaskIdx, LoIdx); 6034 MaskPtr[LoIdx] = Idx; 6035 LoIdx++; 6036 } else { 6037 Locs[i] = std::make_pair(MaskIdx, HiIdx); 6038 MaskPtr[HiIdx] = Idx; 6039 HiIdx++; 6040 } 6041 } 6042 6043 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 6044 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 6045 int MaskOps[] = { -1, -1, -1, -1 }; 6046 for (unsigned i = 0; i != 4; ++i) 6047 if (Locs[i].first != -1) 6048 MaskOps[i] = Locs[i].first * 4 + Locs[i].second; 6049 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 6050} 6051 6052static bool MayFoldVectorLoad(SDValue V) { 6053 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6054 V = V.getOperand(0); 6055 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 6056 V = V.getOperand(0); 6057 if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && 6058 V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) 6059 // BUILD_VECTOR (load), undef 6060 V = V.getOperand(0); 6061 if (MayFoldLoad(V)) 6062 return true; 6063 return false; 6064} 6065 6066// FIXME: the version above should always be used. Since there's 6067// a bug where several vector shuffles can't be folded because the 6068// DAG is not updated during lowering and a node claims to have two 6069// uses while it only has one, use this version, and let isel match 6070// another instruction if the load really happens to have more than 6071// one use. Remove this version after this bug get fixed. 6072// rdar://8434668, PR8156 6073static bool RelaxedMayFoldVectorLoad(SDValue V) { 6074 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6075 V = V.getOperand(0); 6076 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 6077 V = V.getOperand(0); 6078 if (ISD::isNormalLoad(V.getNode())) 6079 return true; 6080 return false; 6081} 6082 6083/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by 6084/// a vector extract, and if both can be later optimized into a single load. 6085/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 6086/// here because otherwise a target specific shuffle node is going to be 6087/// emitted for this shuffle, and the optimization not done. 6088/// FIXME: This is probably not the best approach, but fix the problem 6089/// until the right path is decided. 
6090static 6091bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 6092 const TargetLowering &TLI) { 6093 EVT VT = V.getValueType(); 6094 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 6095 6096 // Be sure that the vector shuffle is present in a pattern like this: 6097 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 6098 if (!V.hasOneUse()) 6099 return false; 6100 6101 SDNode *N = *V.getNode()->use_begin(); 6102 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 6103 return false; 6104 6105 SDValue EltNo = N->getOperand(1); 6106 if (!isa<ConstantSDNode>(EltNo)) 6107 return false; 6108 6109 // If the bit convert changed the number of elements, it is unsafe 6110 // to examine the mask. 6111 bool HasShuffleIntoBitcast = false; 6112 if (V.getOpcode() == ISD::BITCAST) { 6113 EVT SrcVT = V.getOperand(0).getValueType(); 6114 if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) 6115 return false; 6116 V = V.getOperand(0); 6117 HasShuffleIntoBitcast = true; 6118 } 6119 6120 // Select the input vector, guarding against out of range extract vector. 6121 unsigned NumElems = VT.getVectorNumElements(); 6122 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 6123 int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); 6124 V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); 6125 6126 // If we are accessing the upper part of a YMM register 6127 // then the EXTRACT_VECTOR_ELT is likely to be legalized to a sequence of 6128 // EXTRACT_SUBVECTOR + EXTRACT_VECTOR_ELT, which are not detected at this point 6129 // because the legalization of N did not happen yet. 6130 if (Idx >= (int)NumElems/2 && VT.getSizeInBits() == 256) 6131 return false; 6132 6133 // Skip one more bit_convert if necessary 6134 if (V.getOpcode() == ISD::BITCAST) { 6135 if (!V.hasOneUse()) 6136 return false; 6137 V = V.getOperand(0); 6138 } 6139 6140 if (!ISD::isNormalLoad(V.getNode())) 6141 return false; 6142 6143 // Is the original load suitable? 6144 LoadSDNode *LN0 = cast<LoadSDNode>(V); 6145 6146 if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile()) 6147 return false; 6148 6149 if (!HasShuffleIntoBitcast) 6150 return true; 6151 6152 // If there's a bitcast before the shuffle, check if the load type and 6153 // alignment is valid. 6154 unsigned Align = LN0->getAlignment(); 6155 unsigned NewAlign = 6156 TLI.getTargetData()->getABITypeAlignment( 6157 VT.getTypeForEVT(*DAG.getContext())); 6158 6159 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 6160 return false; 6161 6162 return true; 6163} 6164 6165static 6166SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { 6167 EVT VT = Op.getValueType(); 6168 6169 // Canonizalize to v2f64. 
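  // For illustration (register names arbitrary): MOVDDUP duplicates the low
  // 64-bit lane, so the node built below corresponds roughly to
  //   movddup (%rax), %xmm0    ; xmm0 = <mem, mem>
  // when the source is a foldable load, which is why the caller checks
  // RelaxedMayFoldVectorLoad first.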
6170  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
6171  return DAG.getNode(ISD::BITCAST, dl, VT,
6172                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
6173                                          V1, DAG));
6174}
6175
6176static
6177SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
6178                        bool HasSSE2) {
6179  SDValue V1 = Op.getOperand(0);
6180  SDValue V2 = Op.getOperand(1);
6181  EVT VT = Op.getValueType();
6182
6183  assert(VT != MVT::v2i64 && "unsupported shuffle type");
6184
6185  if (HasSSE2 && VT == MVT::v2f64)
6186    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
6187
6188  // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
6189  return DAG.getNode(ISD::BITCAST, dl, VT,
6190                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
6191                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
6192                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
6193}
6194
6195static
6196SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
6197  SDValue V1 = Op.getOperand(0);
6198  SDValue V2 = Op.getOperand(1);
6199  EVT VT = Op.getValueType();
6200
6201  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
6202         "unsupported shuffle type");
6203
6204  if (V2.getOpcode() == ISD::UNDEF)
6205    V2 = V1;
6206
6207  // v4i32 or v4f32
6208  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
6209}
6210
6211static
6212SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
6213  SDValue V1 = Op.getOperand(0);
6214  SDValue V2 = Op.getOperand(1);
6215  EVT VT = Op.getValueType();
6216  unsigned NumElems = VT.getVectorNumElements();
6217
6218  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
6219  // operand of these instructions is only memory, so check if there's a
6220  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
6221  // same masks.
6222  bool CanFoldLoad = false;
6223
6224  // Trivial case, when V2 comes from a load.
6225  if (MayFoldVectorLoad(V2))
6226    CanFoldLoad = true;
6227
6228  // When V1 is a load, it can be folded later into a store in isel, example:
6229  //    (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
6230  // turns into:
6231  //    (MOVLPSmr addr:$src1, VR128:$src2)
6232  // So, recognize this potential and also use MOVLPS or MOVLPD
6233  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
6234    CanFoldLoad = true;
6235
6236  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6237  if (CanFoldLoad) {
6238    if (HasSSE2 && NumElems == 2)
6239      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
6240
6241    if (NumElems == 4)
6242      // If we don't care about the second element, proceed to use movss.
6243      if (SVOp->getMaskElt(1) != -1)
6244        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
6245  }
6246
6247  // movl and movlp will both match v2i64, but v2i64 is never matched by
6248  // movl earlier because we make it strict to avoid messing with the movlp load
6249  // folding logic (see the code above getMOVLP call). Match it here then,
6250  // this is horrible, but will stay like this until we move all shuffle
6251  // matching to x86 specific nodes. Note that for the 1st condition all
6252  // types are matched with movsd.
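  // For reference, a canonical MOVL shuffle takes element 0 from V2 and the
  // rest from V1, so the nodes below select to (register names arbitrary):
  //   v4f32 mask <4,1,2,3>:  movss %xmm1, %xmm0
  //   v2f64 mask <2,1>:      movsd %xmm1, %xmm0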
6253  if (HasSSE2) {
6254    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
6255    // so as to remove this logic from here, as much as possible
6256    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
6257      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
6258    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
6259  }
6260
6261  assert(VT != MVT::v4i32 && "unsupported shuffle type");
6262
6263  // Invert the operand order and use SHUFPS to match it.
6264  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
6265                              getShuffleSHUFImmediate(SVOp), DAG);
6266}
6267
6268static
6269SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
6270                               const TargetLowering &TLI,
6271                               const X86Subtarget *Subtarget) {
6272  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6273  EVT VT = Op.getValueType();
6274  DebugLoc dl = Op.getDebugLoc();
6275  SDValue V1 = Op.getOperand(0);
6276  SDValue V2 = Op.getOperand(1);
6277
6278  if (isZeroShuffle(SVOp))
6279    return getZeroVector(VT, Subtarget, DAG, dl);
6280
6281  // Handle splat operations
6282  if (SVOp->isSplat()) {
6283    unsigned NumElem = VT.getVectorNumElements();
6284    int Size = VT.getSizeInBits();
6285    // Special case, this is the only place now where it's allowed to return
6286    // a vector_shuffle operation without using a target specific node, because
6287    // *hopefully* it will be optimized away by the dag combiner. FIXME: should
6288    // this be moved to DAGCombine instead?
6289    if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
6290      return Op;
6291
6292    // Use vbroadcast whenever the splat comes from a foldable load
6293    SDValue LD = isVectorBroadcast(Op, Subtarget);
6294    if (LD.getNode())
6295      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
6296
6297    // Handle splats by matching through known shuffle masks
6298    if ((Size == 128 && NumElem <= 4) ||
6299        (Size == 256 && NumElem < 8))
6300      return SDValue();
6301
6302    // All remaining splats are promoted to target supported vector shuffles.
6303    return PromoteSplat(SVOp, DAG);
6304  }
6305
6306  // If the shuffle can be profitably rewritten as a narrower shuffle, then
6307  // do it!
6308  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
6309    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
6310    if (NewOp.getNode())
6311      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
6312  } else if ((VT == MVT::v4i32 ||
6313             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
6314    // FIXME: Figure out a cleaner way to do this.
6315    // Try to make use of movq to zero out the top part.
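    // For illustration: something like (v4i32 X, zeroinitializer, <0,1,4,5>)
    // can be rewritten as a v2i64 shuffle and then emitted as
    //   movq %xmm0, %xmm0    ; keep the low 64 bits, zero the upper 64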
6316 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 6317 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6318 if (NewOp.getNode()) { 6319 EVT NewVT = NewOp.getValueType(); 6320 if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), 6321 NewVT, true, false)) 6322 return getVZextMovL(VT, NewVT, NewOp.getOperand(0), 6323 DAG, Subtarget, dl); 6324 } 6325 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 6326 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6327 if (NewOp.getNode()) { 6328 EVT NewVT = NewOp.getValueType(); 6329 if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) 6330 return getVZextMovL(VT, NewVT, NewOp.getOperand(1), 6331 DAG, Subtarget, dl); 6332 } 6333 } 6334 } 6335 return SDValue(); 6336} 6337 6338SDValue 6339X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 6340 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6341 SDValue V1 = Op.getOperand(0); 6342 SDValue V2 = Op.getOperand(1); 6343 EVT VT = Op.getValueType(); 6344 DebugLoc dl = Op.getDebugLoc(); 6345 unsigned NumElems = VT.getVectorNumElements(); 6346 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 6347 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 6348 bool V1IsSplat = false; 6349 bool V2IsSplat = false; 6350 bool HasSSE2 = Subtarget->hasSSE2(); 6351 bool HasAVX = Subtarget->hasAVX(); 6352 bool HasAVX2 = Subtarget->hasAVX2(); 6353 MachineFunction &MF = DAG.getMachineFunction(); 6354 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 6355 6356 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); 6357 6358 if (V1IsUndef && V2IsUndef) 6359 return DAG.getUNDEF(VT); 6360 6361 assert(!V1IsUndef && "Op 1 of shuffle should not be undef"); 6362 6363 // Vector shuffle lowering takes 3 steps: 6364 // 6365 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 6366 // narrowing and commutation of operands should be handled. 6367 // 2) Matching of shuffles with known shuffle masks to x86 target specific 6368 // shuffle nodes. 6369 // 3) Rewriting of unmatched masks into new generic shuffle operations, 6370 // so the shuffle can be broken into other shuffles and the legalizer can 6371 // try the lowering again. 6372 // 6373 // The general idea is that no vector_shuffle operation should be left to 6374 // be matched during isel, all of them must be converted to a target specific 6375 // node here. 6376 6377 // Normalize the input vectors. Here splats, zeroed vectors, profitable 6378 // narrowing and commutation of operands should be handled. The actual code 6379 // doesn't include all of those, work in progress... 6380 SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); 6381 if (NewOp.getNode()) 6382 return NewOp; 6383 6384 SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); 6385 6386 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 6387 // unpckh_undef). Only use pshufd if speed is more important than size. 
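  // For illustration, the v4i32 mask <0,0,1,1> can be emitted either as
  //   punpckldq %xmm0, %xmm0        (no immediate byte)
  // or as
  //   pshufd $0x50, %xmm0, %xmm0    (one byte larger because of the immediate)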
6388 if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) 6389 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6390 if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) 6391 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6392 6393 if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && 6394 V2IsUndef && RelaxedMayFoldVectorLoad(V1)) 6395 return getMOVDDup(Op, dl, V1, DAG); 6396 6397 if (isMOVHLPS_v_undef_Mask(M, VT)) 6398 return getMOVHighToLow(Op, dl, DAG); 6399 6400 // Use to match splats 6401 if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef && 6402 (VT == MVT::v2f64 || VT == MVT::v2i64)) 6403 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6404 6405 if (isPSHUFDMask(M, VT)) { 6406 // The actual implementation will match the mask in the if above and then 6407 // during isel it can match several different instructions, not only pshufd 6408 // as its name says, sad but true, emulate the behavior for now... 6409 if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 6410 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 6411 6412 unsigned TargetMask = getShuffleSHUFImmediate(SVOp); 6413 6414 if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64)) 6415 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG); 6416 6417 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 6418 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 6419 6420 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, 6421 TargetMask, DAG); 6422 } 6423 6424 // Check if this can be converted into a logical shift. 6425 bool isLeft = false; 6426 unsigned ShAmt = 0; 6427 SDValue ShVal; 6428 bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 6429 if (isShift && ShVal.hasOneUse()) { 6430 // If the shifted value has multiple uses, it may be cheaper to use 6431 // v_set0 + movlhps or movhlps, etc. 6432 EVT EltVT = VT.getVectorElementType(); 6433 ShAmt *= EltVT.getSizeInBits(); 6434 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6435 } 6436 6437 if (isMOVLMask(M, VT)) { 6438 if (ISD::isBuildVectorAllZeros(V1.getNode())) 6439 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 6440 if (!isMOVLPMask(M, VT)) { 6441 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 6442 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6443 6444 if (VT == MVT::v4i32 || VT == MVT::v4f32) 6445 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6446 } 6447 } 6448 6449 // FIXME: fold these into legal mask. 6450 if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2)) 6451 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 6452 6453 if (isMOVHLPSMask(M, VT)) 6454 return getMOVHighToLow(Op, dl, DAG); 6455 6456 if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) 6457 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 6458 6459 if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) 6460 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 6461 6462 if (isMOVLPMask(M, VT)) 6463 return getMOVLP(Op, dl, DAG, HasSSE2); 6464 6465 if (ShouldXformToMOVHLPS(M, VT) || 6466 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) 6467 return CommuteVectorShuffle(SVOp, DAG); 6468 6469 if (isShift) { 6470 // No better options. Use a vshldq / vsrldq. 
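    // For illustration: a shuffle that only shifts whole elements in from the
    // zero vector, e.g. (v4i32 X, zeroinitializer, <1,2,3,4>), becomes a byte
    // shift of the full register:
    //   psrldq $4, %xmm0    ; shift right by one i32 element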
6471    EVT EltVT = VT.getVectorElementType();
6472    ShAmt *= EltVT.getSizeInBits();
6473    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
6474  }
6475
6476  bool Commuted = false;
6477  // FIXME: This should also accept a bitcast of a splat? Be careful, not
6478  // 1,1,1,1 -> v8i16 though.
6479  V1IsSplat = isSplatVector(V1.getNode());
6480  V2IsSplat = isSplatVector(V2.getNode());
6481
6482  // Canonicalize the splat or undef, if present, to be on the RHS.
6483  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
6484    CommuteVectorShuffleMask(M, NumElems);
6485    std::swap(V1, V2);
6486    std::swap(V1IsSplat, V2IsSplat);
6487    Commuted = true;
6488  }
6489
6490  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
6491    // Shuffling low element of v1 into undef, just return v1.
6492    if (V2IsUndef)
6493      return V1;
6494    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
6495    // the instruction selector will not match, so get a canonical MOVL with
6496    // swapped operands to undo the commute.
6497    return getMOVL(DAG, dl, VT, V2, V1);
6498  }
6499
6500  if (isUNPCKLMask(M, VT, HasAVX2))
6501    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6502
6503  if (isUNPCKHMask(M, VT, HasAVX2))
6504    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6505
6506  if (V2IsSplat) {
6507    // Normalize the mask so all entries that point to V2 point to its first
6508    // element, then try to match unpck{h|l} again. If they match, return a
6509    // new vector_shuffle with the corrected mask.
6510    SmallVector<int, 8> NewMask(M.begin(), M.end());
6511    NormalizeMask(NewMask, NumElems);
6512    if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) {
6513      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6514    } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) {
6515      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6516    }
6517  }
6518
6519  if (Commuted) {
6520    // Commute it back and try unpck* again.
6521    // FIXME: this seems wrong.
6522    CommuteVectorShuffleMask(M, NumElems);
6523    std::swap(V1, V2);
6524    std::swap(V1IsSplat, V2IsSplat);
6525    Commuted = false;
6526
6527    if (isUNPCKLMask(M, VT, HasAVX2))
6528      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
6529
6530    if (isUNPCKHMask(M, VT, HasAVX2))
6531      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
6532  }
6533
6534  // Normalize the node to match x86 shuffle ops if needed
6535  if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true)))
6536    return CommuteVectorShuffle(SVOp, DAG);
6537
6538  // The checks below are all present in isShuffleMaskLegal, but they are
6539  // inlined here right now to enable us to directly emit target specific
6540  // nodes, and remove one by one until they don't return Op anymore.
6541 6542 if (isPALIGNRMask(M, VT, Subtarget)) 6543 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 6544 getShufflePALIGNRImmediate(SVOp), 6545 DAG); 6546 6547 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 6548 SVOp->getSplatIndex() == 0 && V2IsUndef) { 6549 if (VT == MVT::v2f64 || VT == MVT::v2i64) 6550 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6551 } 6552 6553 if (isPSHUFHWMask(M, VT)) 6554 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 6555 getShufflePSHUFHWImmediate(SVOp), 6556 DAG); 6557 6558 if (isPSHUFLWMask(M, VT)) 6559 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 6560 getShufflePSHUFLWImmediate(SVOp), 6561 DAG); 6562 6563 if (isSHUFPMask(M, VT, HasAVX)) 6564 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, 6565 getShuffleSHUFImmediate(SVOp), DAG); 6566 6567 if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) 6568 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6569 if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) 6570 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6571 6572 //===--------------------------------------------------------------------===// 6573 // Generate target specific nodes for 128 or 256-bit shuffles only 6574 // supported in the AVX instruction set. 6575 // 6576 6577 // Handle VMOVDDUPY permutations 6578 if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX)) 6579 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); 6580 6581 // Handle VPERMILPS/D* permutations 6582 if (isVPERMILPMask(M, VT, HasAVX)) { 6583 if (HasAVX2 && VT == MVT::v8i32) 6584 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, 6585 getShuffleSHUFImmediate(SVOp), DAG); 6586 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, 6587 getShuffleSHUFImmediate(SVOp), DAG); 6588 } 6589 6590 // Handle VPERM2F128/VPERM2I128 permutations 6591 if (isVPERM2X128Mask(M, VT, HasAVX)) 6592 return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, 6593 V2, getShuffleVPERM2X128Immediate(SVOp), DAG); 6594 6595 //===--------------------------------------------------------------------===// 6596 // Since no target specific shuffle was selected for this generic one, 6597 // lower it into other known shuffles. FIXME: this isn't true yet, but 6598 // this is the plan. 6599 // 6600 6601 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 6602 if (VT == MVT::v8i16) { 6603 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 6604 if (NewOp.getNode()) 6605 return NewOp; 6606 } 6607 6608 if (VT == MVT::v16i8) { 6609 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 6610 if (NewOp.getNode()) 6611 return NewOp; 6612 } 6613 6614 // Handle all 128-bit wide vectors with 4 elements, and match them with 6615 // several different shuffle types. 
6616 if (NumElems == 4 && VT.getSizeInBits() == 128) 6617 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 6618 6619 // Handle general 256-bit shuffles 6620 if (VT.is256BitVector()) 6621 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 6622 6623 return SDValue(); 6624} 6625 6626SDValue 6627X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 6628 SelectionDAG &DAG) const { 6629 EVT VT = Op.getValueType(); 6630 DebugLoc dl = Op.getDebugLoc(); 6631 6632 if (Op.getOperand(0).getValueType().getSizeInBits() != 128) 6633 return SDValue(); 6634 6635 if (VT.getSizeInBits() == 8) { 6636 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 6637 Op.getOperand(0), Op.getOperand(1)); 6638 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6639 DAG.getValueType(VT)); 6640 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6641 } else if (VT.getSizeInBits() == 16) { 6642 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6643 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 6644 if (Idx == 0) 6645 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6646 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6647 DAG.getNode(ISD::BITCAST, dl, 6648 MVT::v4i32, 6649 Op.getOperand(0)), 6650 Op.getOperand(1))); 6651 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 6652 Op.getOperand(0), Op.getOperand(1)); 6653 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6654 DAG.getValueType(VT)); 6655 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6656 } else if (VT == MVT::f32) { 6657 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 6658 // the result back to FR32 register. It's only worth matching if the 6659 // result has a single use which is a store or a bitcast to i32. And in 6660 // the case of a store, it's not worth it if the index is a constant 0, 6661 // because a MOVSSmr can be used instead, which is smaller and faster. 6662 if (!Op.hasOneUse()) 6663 return SDValue(); 6664 SDNode *User = *Op.getNode()->use_begin(); 6665 if ((User->getOpcode() != ISD::STORE || 6666 (isa<ConstantSDNode>(Op.getOperand(1)) && 6667 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 6668 (User->getOpcode() != ISD::BITCAST || 6669 User->getValueType(0) != MVT::i32)) 6670 return SDValue(); 6671 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6672 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 6673 Op.getOperand(0)), 6674 Op.getOperand(1)); 6675 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 6676 } else if (VT == MVT::i32 || VT == MVT::i64) { 6677 // ExtractPS/pextrq works with constant index. 6678 if (isa<ConstantSDNode>(Op.getOperand(1))) 6679 return Op; 6680 } 6681 return SDValue(); 6682} 6683 6684 6685SDValue 6686X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 6687 SelectionDAG &DAG) const { 6688 if (!isa<ConstantSDNode>(Op.getOperand(1))) 6689 return SDValue(); 6690 6691 SDValue Vec = Op.getOperand(0); 6692 EVT VecVT = Vec.getValueType(); 6693 6694 // If this is a 256-bit vector result, first extract the 128-bit vector and 6695 // then extract the element from the 128-bit vector. 6696 if (VecVT.getSizeInBits() == 256) { 6697 DebugLoc dl = Op.getNode()->getDebugLoc(); 6698 unsigned NumElems = VecVT.getVectorNumElements(); 6699 SDValue Idx = Op.getOperand(1); 6700 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 6701 6702 // Get the 128-bit vector. 6703 bool Upper = IdxVal >= NumElems/2; 6704 Vec = Extract128BitVector(Vec, 6705 DAG.getConstant(Upper ? 
NumElems/2 : 0, MVT::i32), DAG, dl); 6706 6707 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 6708 Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx); 6709 } 6710 6711 assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); 6712 6713 if (Subtarget->hasSSE41()) { 6714 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 6715 if (Res.getNode()) 6716 return Res; 6717 } 6718 6719 EVT VT = Op.getValueType(); 6720 DebugLoc dl = Op.getDebugLoc(); 6721 // TODO: handle v16i8. 6722 if (VT.getSizeInBits() == 16) { 6723 SDValue Vec = Op.getOperand(0); 6724 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6725 if (Idx == 0) 6726 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6727 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6728 DAG.getNode(ISD::BITCAST, dl, 6729 MVT::v4i32, Vec), 6730 Op.getOperand(1))); 6731 // Transform it so it match pextrw which produces a 32-bit result. 6732 EVT EltVT = MVT::i32; 6733 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 6734 Op.getOperand(0), Op.getOperand(1)); 6735 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 6736 DAG.getValueType(VT)); 6737 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6738 } else if (VT.getSizeInBits() == 32) { 6739 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6740 if (Idx == 0) 6741 return Op; 6742 6743 // SHUFPS the element to the lowest double word, then movss. 6744 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; 6745 EVT VVT = Op.getOperand(0).getValueType(); 6746 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6747 DAG.getUNDEF(VVT), Mask); 6748 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6749 DAG.getIntPtrConstant(0)); 6750 } else if (VT.getSizeInBits() == 64) { 6751 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 6752 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 6753 // to match extract_elt for f64. 6754 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6755 if (Idx == 0) 6756 return Op; 6757 6758 // UNPCKHPD the element to the lowest double word, then movsd. 6759 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 6760 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 6761 int Mask[2] = { 1, -1 }; 6762 EVT VVT = Op.getOperand(0).getValueType(); 6763 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6764 DAG.getUNDEF(VVT), Mask); 6765 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6766 DAG.getIntPtrConstant(0)); 6767 } 6768 6769 return SDValue(); 6770} 6771 6772SDValue 6773X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 6774 SelectionDAG &DAG) const { 6775 EVT VT = Op.getValueType(); 6776 EVT EltVT = VT.getVectorElementType(); 6777 DebugLoc dl = Op.getDebugLoc(); 6778 6779 SDValue N0 = Op.getOperand(0); 6780 SDValue N1 = Op.getOperand(1); 6781 SDValue N2 = Op.getOperand(2); 6782 6783 if (VT.getSizeInBits() == 256) 6784 return SDValue(); 6785 6786 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 6787 isa<ConstantSDNode>(N2)) { 6788 unsigned Opc; 6789 if (VT == MVT::v8i16) 6790 Opc = X86ISD::PINSRW; 6791 else if (VT == MVT::v16i8) 6792 Opc = X86ISD::PINSRB; 6793 else 6794 Opc = X86ISD::PINSRB; 6795 6796 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 6797 // argument. 
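    // For illustration (register names arbitrary), the node built below selects
    // to something like:
    //   pinsrw $3, %eax, %xmm0    ; word 3 of xmm0 = low 16 bits of %eax
    //   pinsrb $5, %eax, %xmm0    ; byte 5 of xmm0 = low 8 bits of %eax (SSE4.1)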
6798 if (N1.getValueType() != MVT::i32) 6799 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6800 if (N2.getValueType() != MVT::i32) 6801 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6802 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 6803 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 6804 // Bits [7:6] of the constant are the source select. This will always be 6805 // zero here. The DAG Combiner may combine an extract_elt index into these 6806 // bits. For example (insert (extract, 3), 2) could be matched by putting 6807 // the '3' into bits [7:6] of X86ISD::INSERTPS. 6808 // Bits [5:4] of the constant are the destination select. This is the 6809 // value of the incoming immediate. 6810 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 6811 // combine either bitwise AND or insert of float 0.0 to set these bits. 6812 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 6813 // Create this as a scalar to vector.. 6814 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 6815 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 6816 } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) && 6817 isa<ConstantSDNode>(N2)) { 6818 // PINSR* works with constant index. 6819 return Op; 6820 } 6821 return SDValue(); 6822} 6823 6824SDValue 6825X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 6826 EVT VT = Op.getValueType(); 6827 EVT EltVT = VT.getVectorElementType(); 6828 6829 DebugLoc dl = Op.getDebugLoc(); 6830 SDValue N0 = Op.getOperand(0); 6831 SDValue N1 = Op.getOperand(1); 6832 SDValue N2 = Op.getOperand(2); 6833 6834 // If this is a 256-bit vector result, first extract the 128-bit vector, 6835 // insert the element into the extracted half and then place it back. 6836 if (VT.getSizeInBits() == 256) { 6837 if (!isa<ConstantSDNode>(N2)) 6838 return SDValue(); 6839 6840 // Get the desired 128-bit vector half. 6841 unsigned NumElems = VT.getVectorNumElements(); 6842 unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); 6843 bool Upper = IdxVal >= NumElems/2; 6844 SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32); 6845 SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl); 6846 6847 // Insert the element into the desired half. 6848 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, 6849 N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2); 6850 6851 // Insert the changed part back to the 256-bit vector 6852 return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); 6853 } 6854 6855 if (Subtarget->hasSSE41()) 6856 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 6857 6858 if (EltVT == MVT::i8) 6859 return SDValue(); 6860 6861 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 6862 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 6863 // as its second argument. 
6864 if (N1.getValueType() != MVT::i32) 6865 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6866 if (N2.getValueType() != MVT::i32) 6867 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6868 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 6869 } 6870 return SDValue(); 6871} 6872 6873SDValue 6874X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 6875 LLVMContext *Context = DAG.getContext(); 6876 DebugLoc dl = Op.getDebugLoc(); 6877 EVT OpVT = Op.getValueType(); 6878 6879 // If this is a 256-bit vector result, first insert into a 128-bit 6880 // vector and then insert into the 256-bit vector. 6881 if (OpVT.getSizeInBits() > 128) { 6882 // Insert into a 128-bit vector. 6883 EVT VT128 = EVT::getVectorVT(*Context, 6884 OpVT.getVectorElementType(), 6885 OpVT.getVectorNumElements() / 2); 6886 6887 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 6888 6889 // Insert the 128-bit vector. 6890 return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op, 6891 DAG.getConstant(0, MVT::i32), 6892 DAG, dl); 6893 } 6894 6895 if (Op.getValueType() == MVT::v1i64 && 6896 Op.getOperand(0).getValueType() == MVT::i64) 6897 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 6898 6899 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 6900 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && 6901 "Expected an SSE type!"); 6902 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), 6903 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 6904} 6905 6906// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 6907// a simple subregister reference or explicit instructions to grab 6908// upper bits of a vector. 6909SDValue 6910X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 6911 if (Subtarget->hasAVX()) { 6912 DebugLoc dl = Op.getNode()->getDebugLoc(); 6913 SDValue Vec = Op.getNode()->getOperand(0); 6914 SDValue Idx = Op.getNode()->getOperand(1); 6915 6916 if (Op.getNode()->getValueType(0).getSizeInBits() == 128 6917 && Vec.getNode()->getValueType(0).getSizeInBits() == 256) { 6918 return Extract128BitVector(Vec, Idx, DAG, dl); 6919 } 6920 } 6921 return SDValue(); 6922} 6923 6924// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 6925// simple superregister reference or explicit instructions to insert 6926// the upper bits of a vector. 6927SDValue 6928X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 6929 if (Subtarget->hasAVX()) { 6930 DebugLoc dl = Op.getNode()->getDebugLoc(); 6931 SDValue Vec = Op.getNode()->getOperand(0); 6932 SDValue SubVec = Op.getNode()->getOperand(1); 6933 SDValue Idx = Op.getNode()->getOperand(2); 6934 6935 if (Op.getNode()->getValueType(0).getSizeInBits() == 256 6936 && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { 6937 return Insert128BitVector(Vec, SubVec, Idx, DAG, dl); 6938 } 6939 } 6940 return SDValue(); 6941} 6942 6943// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 6944// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 6945// one of the above mentioned nodes. It has to be wrapped because otherwise 6946// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 6947// be used to form addressing mode. These wrapped nodes will be selected 6948// into MOV32ri. 
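// For illustration, a 32-bit GOT-style PIC reference to a constant-pool entry
// ends up shaped roughly as
//   (add (X86ISD::GlobalBaseReg),
//        (X86ISD::Wrapper (TargetConstantPool @GOTOFF)))
// which isel can then fold into a single lea off the PIC base register.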
6949SDValue 6950X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 6951 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 6952 6953 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6954 // global base reg. 6955 unsigned char OpFlag = 0; 6956 unsigned WrapperKind = X86ISD::Wrapper; 6957 CodeModel::Model M = getTargetMachine().getCodeModel(); 6958 6959 if (Subtarget->isPICStyleRIPRel() && 6960 (M == CodeModel::Small || M == CodeModel::Kernel)) 6961 WrapperKind = X86ISD::WrapperRIP; 6962 else if (Subtarget->isPICStyleGOT()) 6963 OpFlag = X86II::MO_GOTOFF; 6964 else if (Subtarget->isPICStyleStubPIC()) 6965 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6966 6967 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 6968 CP->getAlignment(), 6969 CP->getOffset(), OpFlag); 6970 DebugLoc DL = CP->getDebugLoc(); 6971 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6972 // With PIC, the address is actually $g + Offset. 6973 if (OpFlag) { 6974 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6975 DAG.getNode(X86ISD::GlobalBaseReg, 6976 DebugLoc(), getPointerTy()), 6977 Result); 6978 } 6979 6980 return Result; 6981} 6982 6983SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 6984 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 6985 6986 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6987 // global base reg. 6988 unsigned char OpFlag = 0; 6989 unsigned WrapperKind = X86ISD::Wrapper; 6990 CodeModel::Model M = getTargetMachine().getCodeModel(); 6991 6992 if (Subtarget->isPICStyleRIPRel() && 6993 (M == CodeModel::Small || M == CodeModel::Kernel)) 6994 WrapperKind = X86ISD::WrapperRIP; 6995 else if (Subtarget->isPICStyleGOT()) 6996 OpFlag = X86II::MO_GOTOFF; 6997 else if (Subtarget->isPICStyleStubPIC()) 6998 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6999 7000 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 7001 OpFlag); 7002 DebugLoc DL = JT->getDebugLoc(); 7003 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7004 7005 // With PIC, the address is actually $g + Offset. 7006 if (OpFlag) 7007 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7008 DAG.getNode(X86ISD::GlobalBaseReg, 7009 DebugLoc(), getPointerTy()), 7010 Result); 7011 7012 return Result; 7013} 7014 7015SDValue 7016X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 7017 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 7018 7019 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7020 // global base reg. 7021 unsigned char OpFlag = 0; 7022 unsigned WrapperKind = X86ISD::Wrapper; 7023 CodeModel::Model M = getTargetMachine().getCodeModel(); 7024 7025 if (Subtarget->isPICStyleRIPRel() && 7026 (M == CodeModel::Small || M == CodeModel::Kernel)) { 7027 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) 7028 OpFlag = X86II::MO_GOTPCREL; 7029 WrapperKind = X86ISD::WrapperRIP; 7030 } else if (Subtarget->isPICStyleGOT()) { 7031 OpFlag = X86II::MO_GOT; 7032 } else if (Subtarget->isPICStyleStubPIC()) { 7033 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; 7034 } else if (Subtarget->isPICStyleStubNoDynamic()) { 7035 OpFlag = X86II::MO_DARWIN_NONLAZY; 7036 } 7037 7038 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 7039 7040 DebugLoc DL = Op.getDebugLoc(); 7041 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7042 7043 7044 // With PIC, the address is actually $g + Offset. 
7045 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 7046 !Subtarget->is64Bit()) { 7047 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7048 DAG.getNode(X86ISD::GlobalBaseReg, 7049 DebugLoc(), getPointerTy()), 7050 Result); 7051 } 7052 7053 // For symbols that require a load from a stub to get the address, emit the 7054 // load. 7055 if (isGlobalStubReference(OpFlag)) 7056 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, 7057 MachinePointerInfo::getGOT(), false, false, false, 0); 7058 7059 return Result; 7060} 7061 7062SDValue 7063X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 7064 // Create the TargetBlockAddressAddress node. 7065 unsigned char OpFlags = 7066 Subtarget->ClassifyBlockAddressReference(); 7067 CodeModel::Model M = getTargetMachine().getCodeModel(); 7068 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 7069 DebugLoc dl = Op.getDebugLoc(); 7070 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 7071 /*isTarget=*/true, OpFlags); 7072 7073 if (Subtarget->isPICStyleRIPRel() && 7074 (M == CodeModel::Small || M == CodeModel::Kernel)) 7075 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7076 else 7077 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7078 7079 // With PIC, the address is actually $g + Offset. 7080 if (isGlobalRelativeToPICBase(OpFlags)) { 7081 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7082 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7083 Result); 7084 } 7085 7086 return Result; 7087} 7088 7089SDValue 7090X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 7091 int64_t Offset, 7092 SelectionDAG &DAG) const { 7093 // Create the TargetGlobalAddress node, folding in the constant 7094 // offset if it is legal. 7095 unsigned char OpFlags = 7096 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 7097 CodeModel::Model M = getTargetMachine().getCodeModel(); 7098 SDValue Result; 7099 if (OpFlags == X86II::MO_NO_FLAG && 7100 X86::isOffsetSuitableForCodeModel(Offset, M)) { 7101 // A direct static reference to a global. 7102 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 7103 Offset = 0; 7104 } else { 7105 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 7106 } 7107 7108 if (Subtarget->isPICStyleRIPRel() && 7109 (M == CodeModel::Small || M == CodeModel::Kernel)) 7110 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7111 else 7112 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7113 7114 // With PIC, the address is actually $g + Offset. 7115 if (isGlobalRelativeToPICBase(OpFlags)) { 7116 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7117 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7118 Result); 7119 } 7120 7121 // For globals that require a load from a stub to get the address, emit the 7122 // load. 7123 if (isGlobalStubReference(OpFlags)) 7124 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 7125 MachinePointerInfo::getGOT(), false, false, false, 0); 7126 7127 // If there was a non-zero offset that we didn't fold, create an explicit 7128 // addition for it. 
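  // For illustration: with a GOT/stub reference to (g + 4), the +4 cannot be
  // folded into the TargetGlobalAddress above, so the result is the stub load
  // followed by an explicit "add $4" emitted below.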
7129 if (Offset != 0) 7130 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 7131 DAG.getConstant(Offset, getPointerTy())); 7132 7133 return Result; 7134} 7135 7136SDValue 7137X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 7138 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 7139 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 7140 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 7141} 7142 7143static SDValue 7144GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 7145 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 7146 unsigned char OperandFlags) { 7147 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7148 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7149 DebugLoc dl = GA->getDebugLoc(); 7150 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7151 GA->getValueType(0), 7152 GA->getOffset(), 7153 OperandFlags); 7154 if (InFlag) { 7155 SDValue Ops[] = { Chain, TGA, *InFlag }; 7156 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 7157 } else { 7158 SDValue Ops[] = { Chain, TGA }; 7159 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 7160 } 7161 7162 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 7163 MFI->setAdjustsStack(true); 7164 7165 SDValue Flag = Chain.getValue(1); 7166 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 7167} 7168 7169// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 7170static SDValue 7171LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7172 const EVT PtrVT) { 7173 SDValue InFlag; 7174 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 7175 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 7176 DAG.getNode(X86ISD::GlobalBaseReg, 7177 DebugLoc(), PtrVT), InFlag); 7178 InFlag = Chain.getValue(1); 7179 7180 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 7181} 7182 7183// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 7184static SDValue 7185LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7186 const EVT PtrVT) { 7187 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 7188 X86::RAX, X86II::MO_TLSGD); 7189} 7190 7191// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 7192// "local exec" model. 7193static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7194 const EVT PtrVT, TLSModel::Model model, 7195 bool is64Bit) { 7196 DebugLoc dl = GA->getDebugLoc(); 7197 7198 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 7199 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 7200 is64Bit ? 257 : 256)); 7201 7202 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 7203 DAG.getIntPtrConstant(0), 7204 MachinePointerInfo(Ptr), 7205 false, false, false, 0); 7206 7207 unsigned char OperandFlags = 0; 7208 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 7209 // initialexec. 7210 unsigned WrapperKind = X86ISD::Wrapper; 7211 if (model == TLSModel::LocalExec) { 7212 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 7213 } else if (is64Bit) { 7214 assert(model == TLSModel::InitialExec); 7215 OperandFlags = X86II::MO_GOTTPOFF; 7216 WrapperKind = X86ISD::WrapperRIP; 7217 } else { 7218 assert(model == TLSModel::InitialExec); 7219 OperandFlags = X86II::MO_INDNTPOFF; 7220 } 7221 7222 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 7223 // exec) 7224 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7225 GA->getValueType(0), 7226 GA->getOffset(), OperandFlags); 7227 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 7228 7229 if (model == TLSModel::InitialExec) 7230 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 7231 MachinePointerInfo::getGOT(), false, false, false, 0); 7232 7233 // The address of the thread local variable is the add of the thread 7234 // pointer with the offset of the variable. 7235 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 7236} 7237 7238SDValue 7239X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 7240 7241 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 7242 const GlobalValue *GV = GA->getGlobal(); 7243 7244 if (Subtarget->isTargetELF()) { 7245 // TODO: implement the "local dynamic" model 7246 // TODO: implement the "initial exec"model for pic executables 7247 7248 // If GV is an alias then use the aliasee for determining 7249 // thread-localness. 7250 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 7251 GV = GA->resolveAliasedGlobal(false); 7252 7253 TLSModel::Model model 7254 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 7255 7256 switch (model) { 7257 case TLSModel::GeneralDynamic: 7258 case TLSModel::LocalDynamic: // not implemented 7259 if (Subtarget->is64Bit()) 7260 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 7261 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 7262 7263 case TLSModel::InitialExec: 7264 case TLSModel::LocalExec: 7265 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 7266 Subtarget->is64Bit()); 7267 } 7268 } else if (Subtarget->isTargetDarwin()) { 7269 // Darwin only has one model of TLS. Lower to that. 7270 unsigned char OpFlag = 0; 7271 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 7272 X86ISD::WrapperRIP : X86ISD::Wrapper; 7273 7274 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7275 // global base reg. 7276 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 7277 !Subtarget->is64Bit(); 7278 if (PIC32) 7279 OpFlag = X86II::MO_TLVP_PIC_BASE; 7280 else 7281 OpFlag = X86II::MO_TLVP; 7282 DebugLoc DL = Op.getDebugLoc(); 7283 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 7284 GA->getValueType(0), 7285 GA->getOffset(), OpFlag); 7286 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7287 7288 // With PIC32, the address is actually $g + Offset. 7289 if (PIC32) 7290 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7291 DAG.getNode(X86ISD::GlobalBaseReg, 7292 DebugLoc(), getPointerTy()), 7293 Offset); 7294 7295 // Lowering the machine isd will make sure everything is in the right 7296 // location. 7297 SDValue Chain = DAG.getEntryNode(); 7298 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7299 SDValue Args[] = { Chain, Offset }; 7300 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 7301 7302 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
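    // For illustration, on x86-64 Darwin the TLSCALL above becomes something
    // like:
    //   movq  _var@TLVP(%rip), %rdi
    //   callq *(%rdi)              ; the variable's address comes back in %rax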
7303 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7304 MFI->setAdjustsStack(true); 7305 7306 // And our return value (tls address) is in the standard call return value 7307 // location. 7308 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 7309 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), 7310 Chain.getValue(1)); 7311 } else if (Subtarget->isTargetWindows()) { 7312 // Just use the implicit TLS architecture 7313 // Need to generate someting similar to: 7314 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage 7315 // ; from TEB 7316 // mov ecx, dword [rel _tls_index]: Load index (from C runtime) 7317 // mov rcx, qword [rdx+rcx*8] 7318 // mov eax, .tls$:tlsvar 7319 // [rax+rcx] contains the address 7320 // Windows 64bit: gs:0x58 7321 // Windows 32bit: fs:__tls_array 7322 7323 // If GV is an alias then use the aliasee for determining 7324 // thread-localness. 7325 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 7326 GV = GA->resolveAliasedGlobal(false); 7327 DebugLoc dl = GA->getDebugLoc(); 7328 SDValue Chain = DAG.getEntryNode(); 7329 7330 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or 7331 // %gs:0x58 (64-bit). 7332 Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() 7333 ? Type::getInt8PtrTy(*DAG.getContext(), 7334 256) 7335 : Type::getInt32PtrTy(*DAG.getContext(), 7336 257)); 7337 7338 SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, 7339 Subtarget->is64Bit() 7340 ? DAG.getIntPtrConstant(0x58) 7341 : DAG.getExternalSymbol("_tls_array", 7342 getPointerTy()), 7343 MachinePointerInfo(Ptr), 7344 false, false, false, 0); 7345 7346 // Load the _tls_index variable 7347 SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); 7348 if (Subtarget->is64Bit()) 7349 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, 7350 IDX, MachinePointerInfo(), MVT::i32, 7351 false, false, 0); 7352 else 7353 IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), 7354 false, false, false, 0); 7355 7356 SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), 7357 getPointerTy()); 7358 IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); 7359 7360 SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); 7361 res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(), 7362 false, false, false, 0); 7363 7364 // Get the offset of start of .tls section 7365 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7366 GA->getValueType(0), 7367 GA->getOffset(), X86II::MO_SECREL); 7368 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA); 7369 7370 // The address of the thread local variable is the add of the thread 7371 // pointer with the offset of the variable. 7372 return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset); 7373 } 7374 7375 llvm_unreachable("TLS not implemented for this target."); 7376} 7377 7378 7379/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values 7380/// and take a 2 x i32 value to shift plus a shift amount. 
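/// For illustration, shifting an i64 left on a 32-bit target comes out roughly
/// as (register names arbitrary):
///   shldl %cl, %eax, %edx    ; hi = (hi << cl) | (lo >> (32 - cl))
///   shll  %cl, %eax          ; lo <<= cl
///   testb $32, %cl           ; if the amount was >= 32, cmov fixes it up to
///                            ;   hi = lo << (cl - 32), lo = 0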
7381SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ 7382 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 7383 EVT VT = Op.getValueType(); 7384 unsigned VTBits = VT.getSizeInBits(); 7385 DebugLoc dl = Op.getDebugLoc(); 7386 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 7387 SDValue ShOpLo = Op.getOperand(0); 7388 SDValue ShOpHi = Op.getOperand(1); 7389 SDValue ShAmt = Op.getOperand(2); 7390 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 7391 DAG.getConstant(VTBits - 1, MVT::i8)) 7392 : DAG.getConstant(0, VT); 7393 7394 SDValue Tmp2, Tmp3; 7395 if (Op.getOpcode() == ISD::SHL_PARTS) { 7396 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 7397 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 7398 } else { 7399 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 7400 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 7401 } 7402 7403 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 7404 DAG.getConstant(VTBits, MVT::i8)); 7405 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 7406 AndNode, DAG.getConstant(0, MVT::i8)); 7407 7408 SDValue Hi, Lo; 7409 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7410 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 7411 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 7412 7413 if (Op.getOpcode() == ISD::SHL_PARTS) { 7414 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7415 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7416 } else { 7417 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7418 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7419 } 7420 7421 SDValue Ops[2] = { Lo, Hi }; 7422 return DAG.getMergeValues(Ops, 2, dl); 7423} 7424 7425SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 7426 SelectionDAG &DAG) const { 7427 EVT SrcVT = Op.getOperand(0).getValueType(); 7428 7429 if (SrcVT.isVector()) 7430 return SDValue(); 7431 7432 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 7433 "Unknown SINT_TO_FP to lower!"); 7434 7435 // These are really Legal; return the operand so the caller accepts it as 7436 // Legal. 
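  // For illustration, (f64 (sint_to_fp i32)) with SSE2 is selected directly to
  // a single cvtsi2sd, so the operand is returned untouched below.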
7437 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 7438 return Op; 7439 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 7440 Subtarget->is64Bit()) { 7441 return Op; 7442 } 7443 7444 DebugLoc dl = Op.getDebugLoc(); 7445 unsigned Size = SrcVT.getSizeInBits()/8; 7446 MachineFunction &MF = DAG.getMachineFunction(); 7447 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 7448 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7449 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7450 StackSlot, 7451 MachinePointerInfo::getFixedStack(SSFI), 7452 false, false, 0); 7453 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 7454} 7455 7456SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 7457 SDValue StackSlot, 7458 SelectionDAG &DAG) const { 7459 // Build the FILD 7460 DebugLoc DL = Op.getDebugLoc(); 7461 SDVTList Tys; 7462 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 7463 if (useSSE) 7464 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 7465 else 7466 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 7467 7468 unsigned ByteSize = SrcVT.getSizeInBits()/8; 7469 7470 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 7471 MachineMemOperand *MMO; 7472 if (FI) { 7473 int SSFI = FI->getIndex(); 7474 MMO = 7475 DAG.getMachineFunction() 7476 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7477 MachineMemOperand::MOLoad, ByteSize, ByteSize); 7478 } else { 7479 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 7480 StackSlot = StackSlot.getOperand(1); 7481 } 7482 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 7483 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 7484 X86ISD::FILD, DL, 7485 Tys, Ops, array_lengthof(Ops), 7486 SrcVT, MMO); 7487 7488 if (useSSE) { 7489 Chain = Result.getValue(1); 7490 SDValue InFlag = Result.getValue(2); 7491 7492 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 7493 // shouldn't be necessary except that RFP cannot be live across 7494 // multiple blocks. When stackifier is fixed, they can be uncoupled. 7495 MachineFunction &MF = DAG.getMachineFunction(); 7496 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 7497 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 7498 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7499 Tys = DAG.getVTList(MVT::Other); 7500 SDValue Ops[] = { 7501 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 7502 }; 7503 MachineMemOperand *MMO = 7504 DAG.getMachineFunction() 7505 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7506 MachineMemOperand::MOStore, SSFISize, SSFISize); 7507 7508 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 7509 Ops, array_lengthof(Ops), 7510 Op.getValueType(), MMO); 7511 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 7512 MachinePointerInfo::getFixedStack(SSFI), 7513 false, false, false, 0); 7514 } 7515 7516 return Result; 7517} 7518 7519// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 7520SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 7521 SelectionDAG &DAG) const { 7522 // This algorithm is not obvious. 
Here it is what we're trying to output: 7523 /* 7524 movq %rax, %xmm0 7525 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } 7526 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } 7527 #ifdef __SSE3__ 7528 haddpd %xmm0, %xmm0 7529 #else 7530 pshufd $0x4e, %xmm0, %xmm1 7531 addpd %xmm1, %xmm0 7532 #endif 7533 */ 7534 7535 DebugLoc dl = Op.getDebugLoc(); 7536 LLVMContext *Context = DAG.getContext(); 7537 7538 // Build some magic constants. 7539 const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; 7540 Constant *C0 = ConstantDataVector::get(*Context, CV0); 7541 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 7542 7543 SmallVector<Constant*,2> CV1; 7544 CV1.push_back( 7545 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 7546 CV1.push_back( 7547 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 7548 Constant *C1 = ConstantVector::get(CV1); 7549 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 7550 7551 // Load the 64-bit value into an XMM register. 7552 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 7553 Op.getOperand(0)); 7554 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 7555 MachinePointerInfo::getConstantPool(), 7556 false, false, false, 16); 7557 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, 7558 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), 7559 CLod0); 7560 7561 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 7562 MachinePointerInfo::getConstantPool(), 7563 false, false, false, 16); 7564 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); 7565 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 7566 SDValue Result; 7567 7568 if (Subtarget->hasSSE3()) { 7569 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. 7570 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); 7571 } else { 7572 SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); 7573 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, 7574 S2F, 0x4E, DAG); 7575 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, 7576 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), 7577 Sub); 7578 } 7579 7580 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, 7581 DAG.getIntPtrConstant(0)); 7582} 7583 7584// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 7585SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 7586 SelectionDAG &DAG) const { 7587 DebugLoc dl = Op.getDebugLoc(); 7588 // FP constant to bias correct the final result. 7589 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 7590 MVT::f64); 7591 7592 // Load the 32-bit value into an XMM register. 7593 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7594 Op.getOperand(0)); 7595 7596 // Zero out the upper parts of the register. 7597 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); 7598 7599 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7600 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 7601 DAG.getIntPtrConstant(0)); 7602 7603 // Or the load with the bias. 
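  // For illustration: the bias is the bit pattern of 2^52, so OR-ing the value
  // into the low mantissa bits forms the exact double (2^52 + x); e.g. x = 7
  // gives bits 0x4330000000000007 == 2^52 + 7, and the subtract below then
  // recovers 7.0 exactly.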
7604 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 7605 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7606 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7607 MVT::v2f64, Load)), 7608 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7609 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7610 MVT::v2f64, Bias))); 7611 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7612 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 7613 DAG.getIntPtrConstant(0)); 7614 7615 // Subtract the bias. 7616 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 7617 7618 // Handle final rounding. 7619 EVT DestVT = Op.getValueType(); 7620 7621 if (DestVT.bitsLT(MVT::f64)) { 7622 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 7623 DAG.getIntPtrConstant(0)); 7624 } else if (DestVT.bitsGT(MVT::f64)) { 7625 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 7626 } 7627 7628 // Handle final rounding. 7629 return Sub; 7630} 7631 7632SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 7633 SelectionDAG &DAG) const { 7634 SDValue N0 = Op.getOperand(0); 7635 DebugLoc dl = Op.getDebugLoc(); 7636 7637 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 7638 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 7639 // the optimization here. 7640 if (DAG.SignBitIsZero(N0)) 7641 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 7642 7643 EVT SrcVT = N0.getValueType(); 7644 EVT DstVT = Op.getValueType(); 7645 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 7646 return LowerUINT_TO_FP_i64(Op, DAG); 7647 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 7648 return LowerUINT_TO_FP_i32(Op, DAG); 7649 else if (Subtarget->is64Bit() && 7650 SrcVT == MVT::i64 && DstVT == MVT::f32) 7651 return SDValue(); 7652 7653 // Make a 64-bit buffer, and use it to build an FILD. 7654 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 7655 if (SrcVT == MVT::i32) { 7656 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 7657 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 7658 getPointerTy(), StackSlot, WordOff); 7659 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7660 StackSlot, MachinePointerInfo(), 7661 false, false, 0); 7662 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 7663 OffsetSlot, MachinePointerInfo(), 7664 false, false, 0); 7665 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 7666 return Fild; 7667 } 7668 7669 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 7670 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7671 StackSlot, MachinePointerInfo(), 7672 false, false, 0); 7673 // For i64 source, we need to add the appropriate power of 2 if the input 7674 // was negative. This is the same as the optimization in 7675 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 7676 // we must be careful to do the computation in x87 extended precision, not 7677 // in SSE. (The generic code can't know it's OK to do this, or how to.) 
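  // Concretely: FILD reads the 64-bit stack slot as a *signed* integer, so an
  // input with its top bit set is loaded as (x - 2^64). The constant-pool pair
  // built below holds 2^64 as a float (bit pattern 0x5F800000) in the low word
  // and +0.0f in the high word; the SETLT test selects offset 0 (2^64) only
  // when the input's sign bit is set, and the FADD is done in f80 so the full
  // 64-bit magnitude survives (f64 has a 53-bit significand, f80 has 64).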
7678 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 7679 MachineMemOperand *MMO = 7680 DAG.getMachineFunction() 7681 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7682 MachineMemOperand::MOLoad, 8, 8); 7683 7684 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 7685 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 7686 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 7687 MVT::i64, MMO); 7688 7689 APInt FF(32, 0x5F800000ULL); 7690 7691 // Check whether the sign bit is set. 7692 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 7693 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 7694 ISD::SETLT); 7695 7696 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 7697 SDValue FudgePtr = DAG.getConstantPool( 7698 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 7699 getPointerTy()); 7700 7701 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 7702 SDValue Zero = DAG.getIntPtrConstant(0); 7703 SDValue Four = DAG.getIntPtrConstant(4); 7704 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 7705 Zero, Four); 7706 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 7707 7708 // Load the value out, extending it from f32 to f80. 7709 // FIXME: Avoid the extend by constructing the right constant pool? 7710 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 7711 FudgePtr, MachinePointerInfo::getConstantPool(), 7712 MVT::f32, false, false, 4); 7713 // Extend everything to 80 bits to force it to be done on x87. 7714 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 7715 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 7716} 7717 7718std::pair<SDValue,SDValue> X86TargetLowering:: 7719FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { 7720 DebugLoc DL = Op.getDebugLoc(); 7721 7722 EVT DstTy = Op.getValueType(); 7723 7724 if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { 7725 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 7726 DstTy = MVT::i64; 7727 } 7728 7729 assert(DstTy.getSimpleVT() <= MVT::i64 && 7730 DstTy.getSimpleVT() >= MVT::i16 && 7731 "Unknown FP_TO_INT to lower!"); 7732 7733 // These are really Legal. 7734 if (DstTy == MVT::i32 && 7735 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7736 return std::make_pair(SDValue(), SDValue()); 7737 if (Subtarget->is64Bit() && 7738 DstTy == MVT::i64 && 7739 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7740 return std::make_pair(SDValue(), SDValue()); 7741 7742 // We lower FP->int64 either into FISTP64 followed by a load from a temporary 7743 // stack slot, or into the FTOL runtime function. 
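  // The FTOL path is only taken for unsigned conversions when
  // isIntegerTypeFTOL() is true for the destination type; that runtime routine
  // leaves the 64-bit result in EDX:EAX, which is why the WIN_FTOL case below
  // copies both registers out and reassembles them with BUILD_PAIR.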
7744 MachineFunction &MF = DAG.getMachineFunction(); 7745 unsigned MemSize = DstTy.getSizeInBits()/8; 7746 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7747 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7748 7749 unsigned Opc; 7750 if (!IsSigned && isIntegerTypeFTOL(DstTy)) 7751 Opc = X86ISD::WIN_FTOL; 7752 else 7753 switch (DstTy.getSimpleVT().SimpleTy) { 7754 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 7755 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 7756 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 7757 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 7758 } 7759 7760 SDValue Chain = DAG.getEntryNode(); 7761 SDValue Value = Op.getOperand(0); 7762 EVT TheVT = Op.getOperand(0).getValueType(); 7763 // FIXME This causes a redundant load/store if the SSE-class value is already 7764 // in memory, such as if it is on the callstack. 7765 if (isScalarFPTypeInSSEReg(TheVT)) { 7766 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 7767 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 7768 MachinePointerInfo::getFixedStack(SSFI), 7769 false, false, 0); 7770 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 7771 SDValue Ops[] = { 7772 Chain, StackSlot, DAG.getValueType(TheVT) 7773 }; 7774 7775 MachineMemOperand *MMO = 7776 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7777 MachineMemOperand::MOLoad, MemSize, MemSize); 7778 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 7779 DstTy, MMO); 7780 Chain = Value.getValue(1); 7781 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7782 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7783 } 7784 7785 MachineMemOperand *MMO = 7786 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7787 MachineMemOperand::MOStore, MemSize, MemSize); 7788 7789 if (Opc != X86ISD::WIN_FTOL) { 7790 // Build the FP_TO_INT*_IN_MEM 7791 SDValue Ops[] = { Chain, Value, StackSlot }; 7792 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 7793 Ops, 3, DstTy, MMO); 7794 return std::make_pair(FIST, StackSlot); 7795 } else { 7796 SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, 7797 DAG.getVTList(MVT::Other, MVT::Glue), 7798 Chain, Value); 7799 SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, 7800 MVT::i32, ftol.getValue(1)); 7801 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, 7802 MVT::i32, eax.getValue(2)); 7803 SDValue Ops[] = { eax, edx }; 7804 SDValue pair = IsReplace 7805 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2) 7806 : DAG.getMergeValues(Ops, 2, DL); 7807 return std::make_pair(pair, SDValue()); 7808 } 7809} 7810 7811SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 7812 SelectionDAG &DAG) const { 7813 if (Op.getValueType().isVector()) 7814 return SDValue(); 7815 7816 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 7817 /*IsSigned=*/ true, /*IsReplace=*/ false); 7818 SDValue FIST = Vals.first, StackSlot = Vals.second; 7819 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 7820 if (FIST.getNode() == 0) return Op; 7821 7822 if (StackSlot.getNode()) 7823 // Load the result. 7824 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7825 FIST, StackSlot, MachinePointerInfo(), 7826 false, false, false, 0); 7827 else 7828 // The node is the result. 
7829 return FIST; 7830} 7831 7832SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 7833 SelectionDAG &DAG) const { 7834 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 7835 /*IsSigned=*/ false, /*IsReplace=*/ false); 7836 SDValue FIST = Vals.first, StackSlot = Vals.second; 7837 assert(FIST.getNode() && "Unexpected failure"); 7838 7839 if (StackSlot.getNode()) 7840 // Load the result. 7841 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7842 FIST, StackSlot, MachinePointerInfo(), 7843 false, false, false, 0); 7844 else 7845 // The node is the result. 7846 return FIST; 7847} 7848 7849SDValue X86TargetLowering::LowerFABS(SDValue Op, 7850 SelectionDAG &DAG) const { 7851 LLVMContext *Context = DAG.getContext(); 7852 DebugLoc dl = Op.getDebugLoc(); 7853 EVT VT = Op.getValueType(); 7854 EVT EltVT = VT; 7855 if (VT.isVector()) 7856 EltVT = VT.getVectorElementType(); 7857 Constant *C; 7858 if (EltVT == MVT::f64) { 7859 C = ConstantVector::getSplat(2, 7860 ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 7861 } else { 7862 C = ConstantVector::getSplat(4, 7863 ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 7864 } 7865 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7866 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7867 MachinePointerInfo::getConstantPool(), 7868 false, false, false, 16); 7869 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 7870} 7871 7872SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 7873 LLVMContext *Context = DAG.getContext(); 7874 DebugLoc dl = Op.getDebugLoc(); 7875 EVT VT = Op.getValueType(); 7876 EVT EltVT = VT; 7877 unsigned NumElts = VT == MVT::f64 ? 2 : 4; 7878 if (VT.isVector()) { 7879 EltVT = VT.getVectorElementType(); 7880 NumElts = VT.getVectorNumElements(); 7881 } 7882 Constant *C; 7883 if (EltVT == MVT::f64) 7884 C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 7885 else 7886 C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 7887 C = ConstantVector::getSplat(NumElts, C); 7888 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7889 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7890 MachinePointerInfo::getConstantPool(), 7891 false, false, false, 16); 7892 if (VT.isVector()) { 7893 MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; 7894 return DAG.getNode(ISD::BITCAST, dl, VT, 7895 DAG.getNode(ISD::XOR, dl, XORVT, 7896 DAG.getNode(ISD::BITCAST, dl, XORVT, 7897 Op.getOperand(0)), 7898 DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); 7899 } else { 7900 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 7901 } 7902} 7903 7904SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 7905 LLVMContext *Context = DAG.getContext(); 7906 SDValue Op0 = Op.getOperand(0); 7907 SDValue Op1 = Op.getOperand(1); 7908 DebugLoc dl = Op.getDebugLoc(); 7909 EVT VT = Op.getValueType(); 7910 EVT SrcVT = Op1.getValueType(); 7911 7912 // If second operand is smaller, extend it first. 7913 if (SrcVT.bitsLT(VT)) { 7914 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 7915 SrcVT = VT; 7916 } 7917 // And if it is bigger, shrink it first. 7918 if (SrcVT.bitsGT(VT)) { 7919 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 7920 SrcVT = VT; 7921 } 7922 7923 // At this point the operands and the result should have the same 7924 // type, and that won't be f80 since that is not custom lowered. 7925 7926 // First get the sign bit of second operand. 
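  // In other words, this lowers copysign(x, y) as
  //   (x & ~SignMask) | (y & SignMask)
  // using constant-pool masks; e.g. for f64 the sign mask is 1ULL << 63, so
  // copysign(3.0, -5.0) keeps 3.0's magnitude, takes -5.0's sign, and yields
  // -3.0.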
7927 SmallVector<Constant*,4> CV; 7928 if (SrcVT == MVT::f64) { 7929 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 7930 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7931 } else { 7932 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 7933 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7934 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7935 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7936 } 7937 Constant *C = ConstantVector::get(CV); 7938 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7939 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 7940 MachinePointerInfo::getConstantPool(), 7941 false, false, false, 16); 7942 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 7943 7944 // Shift sign bit right or left if the two operands have different types. 7945 if (SrcVT.bitsGT(VT)) { 7946 // Op0 is MVT::f32, Op1 is MVT::f64. 7947 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 7948 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 7949 DAG.getConstant(32, MVT::i32)); 7950 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 7951 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 7952 DAG.getIntPtrConstant(0)); 7953 } 7954 7955 // Clear first operand sign bit. 7956 CV.clear(); 7957 if (VT == MVT::f64) { 7958 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 7959 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7960 } else { 7961 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 7962 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7963 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7964 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7965 } 7966 C = ConstantVector::get(CV); 7967 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7968 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7969 MachinePointerInfo::getConstantPool(), 7970 false, false, false, 16); 7971 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 7972 7973 // Or the value with the sign bit. 7974 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 7975} 7976 7977SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { 7978 SDValue N0 = Op.getOperand(0); 7979 DebugLoc dl = Op.getDebugLoc(); 7980 EVT VT = Op.getValueType(); 7981 7982 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 7983 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 7984 DAG.getConstant(1, VT)); 7985 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 7986} 7987 7988/// Emit nodes that will be selected as "test Op0,Op0", or something 7989/// equivalent. 7990SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 7991 SelectionDAG &DAG) const { 7992 DebugLoc dl = Op.getDebugLoc(); 7993 7994 // CF and OF aren't always set the way we want. Determine which 7995 // of these we need. 
7996  bool NeedCF = false;
7997  bool NeedOF = false;
7998  switch (X86CC) {
7999  default: break;
8000  case X86::COND_A: case X86::COND_AE:
8001  case X86::COND_B: case X86::COND_BE:
8002    NeedCF = true;
8003    break;
8004  case X86::COND_G: case X86::COND_GE:
8005  case X86::COND_L: case X86::COND_LE:
8006  case X86::COND_O: case X86::COND_NO:
8007    NeedOF = true;
8008    break;
8009  }
8010
8011  // See if we can use the EFLAGS value from the operand instead of
8012  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
8013  // we prove that the arithmetic won't overflow, we can't use OF or CF.
8014  if (Op.getResNo() != 0 || NeedOF || NeedCF)
8015    // Emit a CMP with 0, which is the TEST pattern.
8016    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8017                       DAG.getConstant(0, Op.getValueType()));
8018
8019  unsigned Opcode = 0;
8020  unsigned NumOperands = 0;
8021  switch (Op.getNode()->getOpcode()) {
8022  case ISD::ADD:
8023    // Due to an isel shortcoming, be conservative if this add is likely to be
8024    // selected as part of a load-modify-store instruction. When the root node
8025    // in a match is a store, isel doesn't know how to remap non-chain non-flag
8026    // uses of other nodes in the match, such as the ADD in this case. This
8027    // leads to the ADD being left around and reselected, with the result being
8028    // two adds in the output. Alas, even if none of our users are stores, that
8029    // doesn't prove we're O.K. Ergo, if we have any parents that aren't
8030    // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
8031    // climbing the DAG back to the root, and it doesn't seem to be worth the
8032    // effort.
8033    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8034         UE = Op.getNode()->use_end(); UI != UE; ++UI)
8035      if (UI->getOpcode() != ISD::CopyToReg &&
8036          UI->getOpcode() != ISD::SETCC &&
8037          UI->getOpcode() != ISD::STORE)
8038        goto default_case;
8039
8040    if (ConstantSDNode *C =
8041          dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
8042      // An add of one will be selected as an INC.
8043      if (C->getAPIntValue() == 1) {
8044        Opcode = X86ISD::INC;
8045        NumOperands = 1;
8046        break;
8047      }
8048
8049      // An add of negative one (subtract of one) will be selected as a DEC.
8050      if (C->getAPIntValue().isAllOnesValue()) {
8051        Opcode = X86ISD::DEC;
8052        NumOperands = 1;
8053        break;
8054      }
8055    }
8056
8057    // Otherwise use a regular EFLAGS-setting add.
8058    Opcode = X86ISD::ADD;
8059    NumOperands = 2;
8060    break;
8061  case ISD::AND: {
8062    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
8063    // because a TEST instruction will be better.
8064    bool NonFlagUse = false;
8065    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8066           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
8067      SDNode *User = *UI;
8068      unsigned UOpNo = UI.getOperandNo();
8069      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
8070        // Look past the truncate.
8071        UOpNo = User->use_begin().getOperandNo();
8072        User = *User->use_begin();
8073      }
8074
8075      if (User->getOpcode() != ISD::BRCOND &&
8076          User->getOpcode() != ISD::SETCC &&
8077          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
8078        NonFlagUse = true;
8079        break;
8080      }
8081    }
8082
8083    if (!NonFlagUse)
8084      break;
8085  }
8086    // FALL THROUGH
8087  case ISD::SUB:
8088  case ISD::OR:
8089  case ISD::XOR:
8090    // Due to the ISEL shortcoming noted above, be conservative if this op is
8091    // likely to be selected as part of a load-modify-store instruction.
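    // For example, (store (add (load p), 1), p) is normally matched as a
    // single read-modify-write "add mem, 1"; if this node's EFLAGS result
    // were also consumed, isel would have to materialize the operation a
    // second time, so fall back to a plain CMP-with-0 (TEST) in that case.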
8092 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 8093 UE = Op.getNode()->use_end(); UI != UE; ++UI) 8094 if (UI->getOpcode() == ISD::STORE) 8095 goto default_case; 8096 8097 // Otherwise use a regular EFLAGS-setting instruction. 8098 switch (Op.getNode()->getOpcode()) { 8099 default: llvm_unreachable("unexpected operator!"); 8100 case ISD::SUB: Opcode = X86ISD::SUB; break; 8101 case ISD::OR: Opcode = X86ISD::OR; break; 8102 case ISD::XOR: Opcode = X86ISD::XOR; break; 8103 case ISD::AND: Opcode = X86ISD::AND; break; 8104 } 8105 8106 NumOperands = 2; 8107 break; 8108 case X86ISD::ADD: 8109 case X86ISD::SUB: 8110 case X86ISD::INC: 8111 case X86ISD::DEC: 8112 case X86ISD::OR: 8113 case X86ISD::XOR: 8114 case X86ISD::AND: 8115 return SDValue(Op.getNode(), 1); 8116 default: 8117 default_case: 8118 break; 8119 } 8120 8121 if (Opcode == 0) 8122 // Emit a CMP with 0, which is the TEST pattern. 8123 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 8124 DAG.getConstant(0, Op.getValueType())); 8125 8126 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 8127 SmallVector<SDValue, 4> Ops; 8128 for (unsigned i = 0; i != NumOperands; ++i) 8129 Ops.push_back(Op.getOperand(i)); 8130 8131 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 8132 DAG.ReplaceAllUsesWith(Op, New); 8133 return SDValue(New.getNode(), 1); 8134} 8135 8136/// Emit nodes that will be selected as "cmp Op0,Op1", or something 8137/// equivalent. 8138SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 8139 SelectionDAG &DAG) const { 8140 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 8141 if (C->getAPIntValue() == 0) 8142 return EmitTest(Op0, X86CC, DAG); 8143 8144 DebugLoc dl = Op0.getDebugLoc(); 8145 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 8146} 8147 8148/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 8149/// if it's possible. 8150SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 8151 DebugLoc dl, SelectionDAG &DAG) const { 8152 SDValue Op0 = And.getOperand(0); 8153 SDValue Op1 = And.getOperand(1); 8154 if (Op0.getOpcode() == ISD::TRUNCATE) 8155 Op0 = Op0.getOperand(0); 8156 if (Op1.getOpcode() == ISD::TRUNCATE) 8157 Op1 = Op1.getOperand(0); 8158 8159 SDValue LHS, RHS; 8160 if (Op1.getOpcode() == ISD::SHL) 8161 std::swap(Op0, Op1); 8162 if (Op0.getOpcode() == ISD::SHL) { 8163 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 8164 if (And00C->getZExtValue() == 1) { 8165 // If we looked past a truncate, check that it's only truncating away 8166 // known zeros. 8167 unsigned BitWidth = Op0.getValueSizeInBits(); 8168 unsigned AndBitWidth = And.getValueSizeInBits(); 8169 if (BitWidth > AndBitWidth) { 8170 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 8171 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 8172 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 8173 return SDValue(); 8174 } 8175 LHS = Op1; 8176 RHS = Op0.getOperand(1); 8177 } 8178 } else if (Op1.getOpcode() == ISD::Constant) { 8179 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 8180 uint64_t AndRHSVal = AndRHS->getZExtValue(); 8181 SDValue AndLHS = Op0; 8182 8183 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { 8184 LHS = AndLHS.getOperand(0); 8185 RHS = AndLHS.getOperand(1); 8186 } 8187 8188 // Use BT if the immediate can't be encoded in a TEST instruction. 
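  // TEST's immediate form only takes a 32-bit (sign-extended) immediate, so a
  // single-bit mask at bit 32 or above cannot be encoded; e.g.
  //   (x & (1ULL << 45)) != 0
  // becomes "bt x, 45" and the result is read from CF via the SETCC below.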
8189 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { 8190 LHS = AndLHS; 8191 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); 8192 } 8193 } 8194 8195 if (LHS.getNode()) { 8196 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 8197 // instruction. Since the shift amount is in-range-or-undefined, we know 8198 // that doing a bittest on the i32 value is ok. We extend to i32 because 8199 // the encoding for the i16 version is larger than the i32 version. 8200 // Also promote i16 to i32 for performance / code size reason. 8201 if (LHS.getValueType() == MVT::i8 || 8202 LHS.getValueType() == MVT::i16) 8203 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 8204 8205 // If the operand types disagree, extend the shift amount to match. Since 8206 // BT ignores high bits (like shifts) we can use anyextend. 8207 if (LHS.getValueType() != RHS.getValueType()) 8208 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 8209 8210 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 8211 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 8212 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8213 DAG.getConstant(Cond, MVT::i8), BT); 8214 } 8215 8216 return SDValue(); 8217} 8218 8219SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 8220 8221 if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); 8222 8223 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 8224 SDValue Op0 = Op.getOperand(0); 8225 SDValue Op1 = Op.getOperand(1); 8226 DebugLoc dl = Op.getDebugLoc(); 8227 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 8228 8229 // Optimize to BT if possible. 8230 // Lower (X & (1 << N)) == 0 to BT(X, N). 8231 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 8232 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 8233 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 8234 Op1.getOpcode() == ISD::Constant && 8235 cast<ConstantSDNode>(Op1)->isNullValue() && 8236 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8237 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 8238 if (NewSetCC.getNode()) 8239 return NewSetCC; 8240 } 8241 8242 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 8243 // these. 8244 if (Op1.getOpcode() == ISD::Constant && 8245 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 8246 cast<ConstantSDNode>(Op1)->isNullValue()) && 8247 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8248 8249 // If the input is a setcc, then reuse the input setcc or use a new one with 8250 // the inverted condition. 8251 if (Op0.getOpcode() == X86ISD::SETCC) { 8252 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 8253 bool Invert = (CC == ISD::SETNE) ^ 8254 cast<ConstantSDNode>(Op1)->isNullValue(); 8255 if (!Invert) return Op0; 8256 8257 CCode = X86::GetOppositeBranchCondition(CCode); 8258 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8259 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 8260 } 8261 } 8262 8263 bool isFP = Op1.getValueType().isFloatingPoint(); 8264 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 8265 if (X86CC == X86::COND_INVALID) 8266 return SDValue(); 8267 8268 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 8269 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8270 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 8271} 8272 8273// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 8274// ones, and then concatenate the result back. 
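// For example, without AVX2 a v8i32 setcc is lowered as two v4i32 compares on
// the extracted 128-bit halves, whose results are then joined back into the
// original 256-bit type with CONCAT_VECTORS.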
8275static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { 8276 EVT VT = Op.getValueType(); 8277 8278 assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC && 8279 "Unsupported value type for operation"); 8280 8281 int NumElems = VT.getVectorNumElements(); 8282 DebugLoc dl = Op.getDebugLoc(); 8283 SDValue CC = Op.getOperand(2); 8284 SDValue Idx0 = DAG.getConstant(0, MVT::i32); 8285 SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); 8286 8287 // Extract the LHS vectors 8288 SDValue LHS = Op.getOperand(0); 8289 SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); 8290 SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); 8291 8292 // Extract the RHS vectors 8293 SDValue RHS = Op.getOperand(1); 8294 SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); 8295 SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); 8296 8297 // Issue the operation on the smaller types and concatenate the result back 8298 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 8299 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 8300 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 8301 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), 8302 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); 8303} 8304 8305 8306SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 8307 SDValue Cond; 8308 SDValue Op0 = Op.getOperand(0); 8309 SDValue Op1 = Op.getOperand(1); 8310 SDValue CC = Op.getOperand(2); 8311 EVT VT = Op.getValueType(); 8312 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 8313 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 8314 DebugLoc dl = Op.getDebugLoc(); 8315 8316 if (isFP) { 8317 unsigned SSECC = 8; 8318 EVT EltVT = Op0.getValueType().getVectorElementType(); 8319 assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT; 8320 8321 bool Swap = false; 8322 8323 // SSE Condition code mapping: 8324 // 0 - EQ 8325 // 1 - LT 8326 // 2 - LE 8327 // 3 - UNORD 8328 // 4 - NEQ 8329 // 5 - NLT 8330 // 6 - NLE 8331 // 7 - ORD 8332 switch (SetCCOpcode) { 8333 default: break; 8334 case ISD::SETOEQ: 8335 case ISD::SETEQ: SSECC = 0; break; 8336 case ISD::SETOGT: 8337 case ISD::SETGT: Swap = true; // Fallthrough 8338 case ISD::SETLT: 8339 case ISD::SETOLT: SSECC = 1; break; 8340 case ISD::SETOGE: 8341 case ISD::SETGE: Swap = true; // Fallthrough 8342 case ISD::SETLE: 8343 case ISD::SETOLE: SSECC = 2; break; 8344 case ISD::SETUO: SSECC = 3; break; 8345 case ISD::SETUNE: 8346 case ISD::SETNE: SSECC = 4; break; 8347 case ISD::SETULE: Swap = true; 8348 case ISD::SETUGE: SSECC = 5; break; 8349 case ISD::SETULT: Swap = true; 8350 case ISD::SETUGT: SSECC = 6; break; 8351 case ISD::SETO: SSECC = 7; break; 8352 } 8353 if (Swap) 8354 std::swap(Op0, Op1); 8355 8356 // In the two special cases we can't handle, emit two comparisons. 
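    // (SETUEQ and SETONE have no single CMPP predicate in the table above, so
    // they are synthesized: unordered-or-equal is cmp(UNORD, imm 3) OR
    // cmp(EQ, imm 0), and ordered-and-not-equal is cmp(ORD, imm 7) AND
    // cmp(NEQ, imm 4), exactly as emitted below.)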
8357 if (SSECC == 8) { 8358 if (SetCCOpcode == ISD::SETUEQ) { 8359 SDValue UNORD, EQ; 8360 UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 8361 DAG.getConstant(3, MVT::i8)); 8362 EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 8363 DAG.getConstant(0, MVT::i8)); 8364 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 8365 } else if (SetCCOpcode == ISD::SETONE) { 8366 SDValue ORD, NEQ; 8367 ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 8368 DAG.getConstant(7, MVT::i8)); 8369 NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 8370 DAG.getConstant(4, MVT::i8)); 8371 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 8372 } 8373 llvm_unreachable("Illegal FP comparison"); 8374 } 8375 // Handle all other FP comparisons here. 8376 return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, 8377 DAG.getConstant(SSECC, MVT::i8)); 8378 } 8379 8380 // Break 256-bit integer vector compare into smaller ones. 8381 if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) 8382 return Lower256IntVSETCC(Op, DAG); 8383 8384 // We are handling one of the integer comparisons here. Since SSE only has 8385 // GT and EQ comparisons for integer, swapping operands and multiple 8386 // operations may be required for some comparisons. 8387 unsigned Opc = 0; 8388 bool Swap = false, Invert = false, FlipSigns = false; 8389 8390 switch (SetCCOpcode) { 8391 default: break; 8392 case ISD::SETNE: Invert = true; 8393 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; 8394 case ISD::SETLT: Swap = true; 8395 case ISD::SETGT: Opc = X86ISD::PCMPGT; break; 8396 case ISD::SETGE: Swap = true; 8397 case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; 8398 case ISD::SETULT: Swap = true; 8399 case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; 8400 case ISD::SETUGE: Swap = true; 8401 case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; 8402 } 8403 if (Swap) 8404 std::swap(Op0, Op1); 8405 8406 // Check that the operation in question is available (most are plain SSE2, 8407 // but PCMPGTQ and PCMPEQQ have different requirements). 8408 if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42()) 8409 return SDValue(); 8410 if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41()) 8411 return SDValue(); 8412 8413 // Since SSE has no unsigned integer comparisons, we need to flip the sign 8414 // bits of the inputs before performing those operations. 8415 if (FlipSigns) { 8416 EVT EltVT = VT.getVectorElementType(); 8417 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 8418 EltVT); 8419 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 8420 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 8421 SignBits.size()); 8422 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 8423 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 8424 } 8425 8426 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 8427 8428 // If the logical-not of the result is required, perform that now. 8429 if (Invert) 8430 Result = DAG.getNOT(dl, Result, VT); 8431 8432 return Result; 8433} 8434 8435// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
8436static bool isX86LogicalCmp(SDValue Op) { 8437 unsigned Opc = Op.getNode()->getOpcode(); 8438 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 8439 return true; 8440 if (Op.getResNo() == 1 && 8441 (Opc == X86ISD::ADD || 8442 Opc == X86ISD::SUB || 8443 Opc == X86ISD::ADC || 8444 Opc == X86ISD::SBB || 8445 Opc == X86ISD::SMUL || 8446 Opc == X86ISD::UMUL || 8447 Opc == X86ISD::INC || 8448 Opc == X86ISD::DEC || 8449 Opc == X86ISD::OR || 8450 Opc == X86ISD::XOR || 8451 Opc == X86ISD::AND)) 8452 return true; 8453 8454 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 8455 return true; 8456 8457 return false; 8458} 8459 8460static bool isZero(SDValue V) { 8461 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8462 return C && C->isNullValue(); 8463} 8464 8465static bool isAllOnes(SDValue V) { 8466 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8467 return C && C->isAllOnesValue(); 8468} 8469 8470SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 8471 bool addTest = true; 8472 SDValue Cond = Op.getOperand(0); 8473 SDValue Op1 = Op.getOperand(1); 8474 SDValue Op2 = Op.getOperand(2); 8475 DebugLoc DL = Op.getDebugLoc(); 8476 SDValue CC; 8477 8478 if (Cond.getOpcode() == ISD::SETCC) { 8479 SDValue NewCond = LowerSETCC(Cond, DAG); 8480 if (NewCond.getNode()) 8481 Cond = NewCond; 8482 } 8483 8484 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 8485 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 8486 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 8487 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 8488 if (Cond.getOpcode() == X86ISD::SETCC && 8489 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 8490 isZero(Cond.getOperand(1).getOperand(1))) { 8491 SDValue Cmp = Cond.getOperand(1); 8492 8493 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 8494 8495 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 8496 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 8497 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 8498 8499 SDValue CmpOp0 = Cmp.getOperand(0); 8500 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 8501 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 8502 8503 SDValue Res = // Res = 0 or -1. 8504 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8505 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 8506 8507 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 8508 Res = DAG.getNOT(DL, Res, Res.getValueType()); 8509 8510 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 8511 if (N2C == 0 || !N2C->isNullValue()) 8512 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 8513 return Res; 8514 } 8515 } 8516 8517 // Look past (and (setcc_carry (cmp ...)), 1). 8518 if (Cond.getOpcode() == ISD::AND && 8519 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8520 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8521 if (C && C->getAPIntValue() == 1) 8522 Cond = Cond.getOperand(0); 8523 } 8524 8525 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8526 // setting operand in place of the X86ISD::SETCC. 8527 unsigned CondOpcode = Cond.getOpcode(); 8528 if (CondOpcode == X86ISD::SETCC || 8529 CondOpcode == X86ISD::SETCC_CARRY) { 8530 CC = Cond.getOperand(0); 8531 8532 SDValue Cmp = Cond.getOperand(1); 8533 unsigned Opc = Cmp.getOpcode(); 8534 EVT VT = Op.getValueType(); 8535 8536 bool IllegalFPCMov = false; 8537 if (VT.isFloatingPoint() && !VT.isVector() && 8538 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
8539 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 8540 8541 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 8542 Opc == X86ISD::BT) { // FIXME 8543 Cond = Cmp; 8544 addTest = false; 8545 } 8546 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 8547 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 8548 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 8549 Cond.getOperand(0).getValueType() != MVT::i8)) { 8550 SDValue LHS = Cond.getOperand(0); 8551 SDValue RHS = Cond.getOperand(1); 8552 unsigned X86Opcode; 8553 unsigned X86Cond; 8554 SDVTList VTs; 8555 switch (CondOpcode) { 8556 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 8557 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 8558 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 8559 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 8560 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 8561 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 8562 default: llvm_unreachable("unexpected overflowing operator"); 8563 } 8564 if (CondOpcode == ISD::UMULO) 8565 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 8566 MVT::i32); 8567 else 8568 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 8569 8570 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); 8571 8572 if (CondOpcode == ISD::UMULO) 8573 Cond = X86Op.getValue(2); 8574 else 8575 Cond = X86Op.getValue(1); 8576 8577 CC = DAG.getConstant(X86Cond, MVT::i8); 8578 addTest = false; 8579 } 8580 8581 if (addTest) { 8582 // Look pass the truncate. 8583 if (Cond.getOpcode() == ISD::TRUNCATE) 8584 Cond = Cond.getOperand(0); 8585 8586 // We know the result of AND is compared against zero. Try to match 8587 // it to BT. 8588 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8589 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 8590 if (NewSetCC.getNode()) { 8591 CC = NewSetCC.getOperand(0); 8592 Cond = NewSetCC.getOperand(1); 8593 addTest = false; 8594 } 8595 } 8596 } 8597 8598 if (addTest) { 8599 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8600 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8601 } 8602 8603 // a < b ? -1 : 0 -> RES = ~setcc_carry 8604 // a < b ? 0 : -1 -> RES = setcc_carry 8605 // a >= b ? -1 : 0 -> RES = setcc_carry 8606 // a >= b ? 0 : -1 -> RES = ~setcc_carry 8607 if (Cond.getOpcode() == X86ISD::CMP) { 8608 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 8609 8610 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 8611 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 8612 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8613 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 8614 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 8615 return DAG.getNOT(DL, Res, Res.getValueType()); 8616 return Res; 8617 } 8618 } 8619 8620 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 8621 // condition is true. 8622 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 8623 SDValue Ops[] = { Op2, Op1, CC, Cond }; 8624 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 8625} 8626 8627// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 8628// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 8629// from the AND / OR. 
8630static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 8631 Opc = Op.getOpcode(); 8632 if (Opc != ISD::OR && Opc != ISD::AND) 8633 return false; 8634 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8635 Op.getOperand(0).hasOneUse() && 8636 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 8637 Op.getOperand(1).hasOneUse()); 8638} 8639 8640// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 8641// 1 and that the SETCC node has a single use. 8642static bool isXor1OfSetCC(SDValue Op) { 8643 if (Op.getOpcode() != ISD::XOR) 8644 return false; 8645 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 8646 if (N1C && N1C->getAPIntValue() == 1) { 8647 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8648 Op.getOperand(0).hasOneUse(); 8649 } 8650 return false; 8651} 8652 8653SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 8654 bool addTest = true; 8655 SDValue Chain = Op.getOperand(0); 8656 SDValue Cond = Op.getOperand(1); 8657 SDValue Dest = Op.getOperand(2); 8658 DebugLoc dl = Op.getDebugLoc(); 8659 SDValue CC; 8660 bool Inverted = false; 8661 8662 if (Cond.getOpcode() == ISD::SETCC) { 8663 // Check for setcc([su]{add,sub,mul}o == 0). 8664 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && 8665 isa<ConstantSDNode>(Cond.getOperand(1)) && 8666 cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && 8667 Cond.getOperand(0).getResNo() == 1 && 8668 (Cond.getOperand(0).getOpcode() == ISD::SADDO || 8669 Cond.getOperand(0).getOpcode() == ISD::UADDO || 8670 Cond.getOperand(0).getOpcode() == ISD::SSUBO || 8671 Cond.getOperand(0).getOpcode() == ISD::USUBO || 8672 Cond.getOperand(0).getOpcode() == ISD::SMULO || 8673 Cond.getOperand(0).getOpcode() == ISD::UMULO)) { 8674 Inverted = true; 8675 Cond = Cond.getOperand(0); 8676 } else { 8677 SDValue NewCond = LowerSETCC(Cond, DAG); 8678 if (NewCond.getNode()) 8679 Cond = NewCond; 8680 } 8681 } 8682#if 0 8683 // FIXME: LowerXALUO doesn't handle these!! 8684 else if (Cond.getOpcode() == X86ISD::ADD || 8685 Cond.getOpcode() == X86ISD::SUB || 8686 Cond.getOpcode() == X86ISD::SMUL || 8687 Cond.getOpcode() == X86ISD::UMUL) 8688 Cond = LowerXALUO(Cond, DAG); 8689#endif 8690 8691 // Look pass (and (setcc_carry (cmp ...)), 1). 8692 if (Cond.getOpcode() == ISD::AND && 8693 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8694 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8695 if (C && C->getAPIntValue() == 1) 8696 Cond = Cond.getOperand(0); 8697 } 8698 8699 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8700 // setting operand in place of the X86ISD::SETCC. 8701 unsigned CondOpcode = Cond.getOpcode(); 8702 if (CondOpcode == X86ISD::SETCC || 8703 CondOpcode == X86ISD::SETCC_CARRY) { 8704 CC = Cond.getOperand(0); 8705 8706 SDValue Cmp = Cond.getOperand(1); 8707 unsigned Opc = Cmp.getOpcode(); 8708 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 8709 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 8710 Cond = Cmp; 8711 addTest = false; 8712 } else { 8713 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 8714 default: break; 8715 case X86::COND_O: 8716 case X86::COND_B: 8717 // These can only come from an arithmetic instruction with overflow, 8718 // e.g. SADDO, UADDO. 
8719 Cond = Cond.getNode()->getOperand(1); 8720 addTest = false; 8721 break; 8722 } 8723 } 8724 } 8725 CondOpcode = Cond.getOpcode(); 8726 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 8727 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 8728 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 8729 Cond.getOperand(0).getValueType() != MVT::i8)) { 8730 SDValue LHS = Cond.getOperand(0); 8731 SDValue RHS = Cond.getOperand(1); 8732 unsigned X86Opcode; 8733 unsigned X86Cond; 8734 SDVTList VTs; 8735 switch (CondOpcode) { 8736 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 8737 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 8738 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 8739 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 8740 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 8741 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 8742 default: llvm_unreachable("unexpected overflowing operator"); 8743 } 8744 if (Inverted) 8745 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); 8746 if (CondOpcode == ISD::UMULO) 8747 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 8748 MVT::i32); 8749 else 8750 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 8751 8752 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); 8753 8754 if (CondOpcode == ISD::UMULO) 8755 Cond = X86Op.getValue(2); 8756 else 8757 Cond = X86Op.getValue(1); 8758 8759 CC = DAG.getConstant(X86Cond, MVT::i8); 8760 addTest = false; 8761 } else { 8762 unsigned CondOpc; 8763 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 8764 SDValue Cmp = Cond.getOperand(0).getOperand(1); 8765 if (CondOpc == ISD::OR) { 8766 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 8767 // two branches instead of an explicit OR instruction with a 8768 // separate test. 8769 if (Cmp == Cond.getOperand(1).getOperand(1) && 8770 isX86LogicalCmp(Cmp)) { 8771 CC = Cond.getOperand(0).getOperand(0); 8772 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8773 Chain, Dest, CC, Cmp); 8774 CC = Cond.getOperand(1).getOperand(0); 8775 Cond = Cmp; 8776 addTest = false; 8777 } 8778 } else { // ISD::AND 8779 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 8780 // two branches instead of an explicit AND instruction with a 8781 // separate test. However, we only do this if this block doesn't 8782 // have a fall-through edge, because this requires an explicit 8783 // jmp when the condition is false. 8784 if (Cmp == Cond.getOperand(1).getOperand(1) && 8785 isX86LogicalCmp(Cmp) && 8786 Op.getNode()->hasOneUse()) { 8787 X86::CondCode CCode = 8788 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8789 CCode = X86::GetOppositeBranchCondition(CCode); 8790 CC = DAG.getConstant(CCode, MVT::i8); 8791 SDNode *User = *Op.getNode()->use_begin(); 8792 // Look for an unconditional branch following this conditional branch. 8793 // We need this because we need to reverse the successors in order 8794 // to implement FCMP_OEQ. 
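          // (Reversing the successors lets us emit two conditional branches to
          // the original false block and simply fall through to the true
          // block; for an ordered-equal compare that is "jne false; jp false",
          // since UCOMISD sets ZF and PF together when the operands are
          // unordered.)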
8795          if (User->getOpcode() == ISD::BR) {
8796            SDValue FalseBB = User->getOperand(1);
8797            SDNode *NewBR =
8798              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
8799            assert(NewBR == User);
8800            (void)NewBR;
8801            Dest = FalseBB;
8802
8803            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
8804                                Chain, Dest, CC, Cmp);
8805            X86::CondCode CCode =
8806              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
8807            CCode = X86::GetOppositeBranchCondition(CCode);
8808            CC = DAG.getConstant(CCode, MVT::i8);
8809            Cond = Cmp;
8810            addTest = false;
8811          }
8812        }
8813      }
8814    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
8815      // Recognize 'xorb (setcc), 1' patterns. The xor inverts the condition.
8816      // It should be transformed during dag combiner except when the condition
8817      // is set by an arithmetic-with-overflow node.
8818      X86::CondCode CCode =
8819        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
8820      CCode = X86::GetOppositeBranchCondition(CCode);
8821      CC = DAG.getConstant(CCode, MVT::i8);
8822      Cond = Cond.getOperand(0).getOperand(1);
8823      addTest = false;
8824    } else if (Cond.getOpcode() == ISD::SETCC &&
8825               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
8826      // For FCMP_OEQ, we can emit
8827      // two branches instead of an explicit AND instruction with a
8828      // separate test. However, we only do this if this block doesn't
8829      // have a fall-through edge, because this requires an explicit
8830      // jmp when the condition is false.
8831      if (Op.getNode()->hasOneUse()) {
8832        SDNode *User = *Op.getNode()->use_begin();
8833        // Look for an unconditional branch following this conditional branch.
8834        // We need this because we need to reverse the successors in order
8835        // to implement FCMP_OEQ.
8836        if (User->getOpcode() == ISD::BR) {
8837          SDValue FalseBB = User->getOperand(1);
8838          SDNode *NewBR =
8839            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
8840          assert(NewBR == User);
8841          (void)NewBR;
8842          Dest = FalseBB;
8843
8844          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
8845                                    Cond.getOperand(0), Cond.getOperand(1));
8846          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
8847          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
8848                              Chain, Dest, CC, Cmp);
8849          CC = DAG.getConstant(X86::COND_P, MVT::i8);
8850          Cond = Cmp;
8851          addTest = false;
8852        }
8853      }
8854    } else if (Cond.getOpcode() == ISD::SETCC &&
8855               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
8856      // For FCMP_UNE, we can emit
8857      // two branches instead of an explicit AND instruction with a
8858      // separate test. However, we only do this if this block doesn't
8859      // have a fall-through edge, because this requires an explicit
8860      // jmp when the condition is false.
8861      if (Op.getNode()->hasOneUse()) {
8862        SDNode *User = *Op.getNode()->use_begin();
8863        // Look for an unconditional branch following this conditional branch.
8864        // We need this because we need to reverse the successors in order
8865        // to implement FCMP_UNE.
8866 if (User->getOpcode() == ISD::BR) { 8867 SDValue FalseBB = User->getOperand(1); 8868 SDNode *NewBR = 8869 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 8870 assert(NewBR == User); 8871 (void)NewBR; 8872 8873 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 8874 Cond.getOperand(0), Cond.getOperand(1)); 8875 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8876 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8877 Chain, Dest, CC, Cmp); 8878 CC = DAG.getConstant(X86::COND_NP, MVT::i8); 8879 Cond = Cmp; 8880 addTest = false; 8881 Dest = FalseBB; 8882 } 8883 } 8884 } 8885 } 8886 8887 if (addTest) { 8888 // Look pass the truncate. 8889 if (Cond.getOpcode() == ISD::TRUNCATE) 8890 Cond = Cond.getOperand(0); 8891 8892 // We know the result of AND is compared against zero. Try to match 8893 // it to BT. 8894 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8895 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 8896 if (NewSetCC.getNode()) { 8897 CC = NewSetCC.getOperand(0); 8898 Cond = NewSetCC.getOperand(1); 8899 addTest = false; 8900 } 8901 } 8902 } 8903 8904 if (addTest) { 8905 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8906 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8907 } 8908 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8909 Chain, Dest, CC, Cond); 8910} 8911 8912 8913// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 8914// Calls to _alloca is needed to probe the stack when allocating more than 4k 8915// bytes in one go. Touching the stack at 4K increments is necessary to ensure 8916// that the guard pages used by the OS virtual memory manager are allocated in 8917// correct sequence. 8918SDValue 8919X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 8920 SelectionDAG &DAG) const { 8921 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || 8922 getTargetMachine().Options.EnableSegmentedStacks) && 8923 "This should be used only on Windows targets or when segmented stacks " 8924 "are being used"); 8925 assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); 8926 DebugLoc dl = Op.getDebugLoc(); 8927 8928 // Get the inputs. 8929 SDValue Chain = Op.getOperand(0); 8930 SDValue Size = Op.getOperand(1); 8931 // FIXME: Ensure alignment here 8932 8933 bool Is64Bit = Subtarget->is64Bit(); 8934 EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; 8935 8936 if (getTargetMachine().Options.EnableSegmentedStacks) { 8937 MachineFunction &MF = DAG.getMachineFunction(); 8938 MachineRegisterInfo &MRI = MF.getRegInfo(); 8939 8940 if (Is64Bit) { 8941 // The 64 bit implementation of segmented stacks needs to clobber both r10 8942 // r11. This makes it impossible to use it along with nested parameters. 8943 const Function *F = MF.getFunction(); 8944 8945 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); 8946 I != E; I++) 8947 if (I->hasNestAttr()) 8948 report_fatal_error("Cannot use segmented stacks with functions that " 8949 "have nested arguments."); 8950 } 8951 8952 const TargetRegisterClass *AddrRegClass = 8953 getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32); 8954 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); 8955 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); 8956 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, 8957 DAG.getRegister(Vreg, SPTy)); 8958 SDValue Ops1[2] = { Value, Chain }; 8959 return DAG.getMergeValues(Ops1, 2, dl); 8960 } else { 8961 SDValue Flag; 8962 unsigned Reg = (Subtarget->is64Bit() ? 
X86::RAX : X86::EAX); 8963 8964 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 8965 Flag = Chain.getValue(1); 8966 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 8967 8968 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 8969 Flag = Chain.getValue(1); 8970 8971 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 8972 8973 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 8974 return DAG.getMergeValues(Ops1, 2, dl); 8975 } 8976} 8977 8978SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 8979 MachineFunction &MF = DAG.getMachineFunction(); 8980 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 8981 8982 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 8983 DebugLoc DL = Op.getDebugLoc(); 8984 8985 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 8986 // vastart just stores the address of the VarArgsFrameIndex slot into the 8987 // memory location argument. 8988 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 8989 getPointerTy()); 8990 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 8991 MachinePointerInfo(SV), false, false, 0); 8992 } 8993 8994 // __va_list_tag: 8995 // gp_offset (0 - 6 * 8) 8996 // fp_offset (48 - 48 + 8 * 16) 8997 // overflow_arg_area (point to parameters coming in memory). 8998 // reg_save_area 8999 SmallVector<SDValue, 8> MemOps; 9000 SDValue FIN = Op.getOperand(1); 9001 // Store gp_offset 9002 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 9003 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 9004 MVT::i32), 9005 FIN, MachinePointerInfo(SV), false, false, 0); 9006 MemOps.push_back(Store); 9007 9008 // Store fp_offset 9009 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9010 FIN, DAG.getIntPtrConstant(4)); 9011 Store = DAG.getStore(Op.getOperand(0), DL, 9012 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 9013 MVT::i32), 9014 FIN, MachinePointerInfo(SV, 4), false, false, 0); 9015 MemOps.push_back(Store); 9016 9017 // Store ptr to overflow_arg_area 9018 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9019 FIN, DAG.getIntPtrConstant(4)); 9020 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 9021 getPointerTy()); 9022 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 9023 MachinePointerInfo(SV, 8), 9024 false, false, 0); 9025 MemOps.push_back(Store); 9026 9027 // Store ptr to reg_save_area. 
9028 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9029 FIN, DAG.getIntPtrConstant(8)); 9030 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 9031 getPointerTy()); 9032 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 9033 MachinePointerInfo(SV, 16), false, false, 0); 9034 MemOps.push_back(Store); 9035 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 9036 &MemOps[0], MemOps.size()); 9037} 9038 9039SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 9040 assert(Subtarget->is64Bit() && 9041 "LowerVAARG only handles 64-bit va_arg!"); 9042 assert((Subtarget->isTargetLinux() || 9043 Subtarget->isTargetDarwin()) && 9044 "Unhandled target in LowerVAARG"); 9045 assert(Op.getNode()->getNumOperands() == 4); 9046 SDValue Chain = Op.getOperand(0); 9047 SDValue SrcPtr = Op.getOperand(1); 9048 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9049 unsigned Align = Op.getConstantOperandVal(3); 9050 DebugLoc dl = Op.getDebugLoc(); 9051 9052 EVT ArgVT = Op.getNode()->getValueType(0); 9053 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 9054 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 9055 uint8_t ArgMode; 9056 9057 // Decide which area this value should be read from. 9058 // TODO: Implement the AMD64 ABI in its entirety. This simple 9059 // selection mechanism works only for the basic types. 9060 if (ArgVT == MVT::f80) { 9061 llvm_unreachable("va_arg for f80 not yet implemented"); 9062 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 9063 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 9064 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 9065 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 9066 } else { 9067 llvm_unreachable("Unhandled argument type in LowerVAARG"); 9068 } 9069 9070 if (ArgMode == 2) { 9071 // Sanity Check: Make sure using fp_offset makes sense. 9072 assert(!getTargetMachine().Options.UseSoftFloat && 9073 !(DAG.getMachineFunction() 9074 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 9075 Subtarget->hasSSE1()); 9076 } 9077 9078 // Insert VAARG_64 node into the DAG 9079 // VAARG_64 returns two values: Variable Argument Address, Chain 9080 SmallVector<SDValue, 11> InstOps; 9081 InstOps.push_back(Chain); 9082 InstOps.push_back(SrcPtr); 9083 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 9084 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 9085 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 9086 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 9087 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 9088 VTs, &InstOps[0], InstOps.size(), 9089 MVT::i64, 9090 MachinePointerInfo(SV), 9091 /*Align=*/0, 9092 /*Volatile=*/false, 9093 /*ReadMem=*/true, 9094 /*WriteMem=*/true); 9095 Chain = VAARG.getValue(1); 9096 9097 // Load the next argument and return it 9098 return DAG.getLoad(ArgVT, dl, 9099 Chain, 9100 VAARG, 9101 MachinePointerInfo(), 9102 false, false, false, 0); 9103} 9104 9105SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 9106 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
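  // That is 4 + 4 + 8 + 8 = 24 bytes (gp_offset, fp_offset, overflow_arg_area,
  // reg_save_area), so va_copy is just the 24-byte, 8-byte-aligned memcpy
  // emitted below.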
9107  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
9108  SDValue Chain = Op.getOperand(0);
9109  SDValue DstPtr = Op.getOperand(1);
9110  SDValue SrcPtr = Op.getOperand(2);
9111  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
9112  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
9113  DebugLoc DL = Op.getDebugLoc();
9114
9115  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
9116                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
9117                       false,
9118                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
9119}
9120
9121// getTargetVShiftNode - Handle vector element shifts where the shift amount
9122// may or may not be a constant. Takes immediate version of shift as input.
9123static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
9124                                   SDValue SrcOp, SDValue ShAmt,
9125                                   SelectionDAG &DAG) {
9126  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
9127
9128  if (isa<ConstantSDNode>(ShAmt)) {
9129    switch (Opc) {
9130      default: llvm_unreachable("Unknown target vector shift node");
9131      case X86ISD::VSHLI:
9132      case X86ISD::VSRLI:
9133      case X86ISD::VSRAI:
9134        return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
9135    }
9136  }
9137
9138  // Change opcode to non-immediate version
9139  switch (Opc) {
9140    default: llvm_unreachable("Unknown target vector shift node");
9141    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
9142    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
9143    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
9144  }
9145
9146  // Need to build a vector containing shift amount
9147  // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0
9148  SDValue ShOps[4];
9149  ShOps[0] = ShAmt;
9150  ShOps[1] = DAG.getConstant(0, MVT::i32);
9151  ShOps[2] = DAG.getUNDEF(MVT::i32);
9152  ShOps[3] = DAG.getUNDEF(MVT::i32);
9153  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
9154  ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
9155  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
9156}
9157
9158SDValue
9159X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
9160  DebugLoc dl = Op.getDebugLoc();
9161  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9162  switch (IntNo) {
9163  default: return SDValue();    // Don't custom lower most intrinsics.
9164  // Comparison intrinsics.
9165  case Intrinsic::x86_sse_comieq_ss:
9166  case Intrinsic::x86_sse_comilt_ss:
9167  case Intrinsic::x86_sse_comile_ss:
9168  case Intrinsic::x86_sse_comigt_ss:
9169  case Intrinsic::x86_sse_comige_ss:
9170  case Intrinsic::x86_sse_comineq_ss:
9171  case Intrinsic::x86_sse_ucomieq_ss:
9172  case Intrinsic::x86_sse_ucomilt_ss:
9173  case Intrinsic::x86_sse_ucomile_ss:
9174  case Intrinsic::x86_sse_ucomigt_ss:
9175  case Intrinsic::x86_sse_ucomige_ss:
9176  case Intrinsic::x86_sse_ucomineq_ss:
9177  case Intrinsic::x86_sse2_comieq_sd:
9178  case Intrinsic::x86_sse2_comilt_sd:
9179  case Intrinsic::x86_sse2_comile_sd:
9180  case Intrinsic::x86_sse2_comigt_sd:
9181  case Intrinsic::x86_sse2_comige_sd:
9182  case Intrinsic::x86_sse2_comineq_sd:
9183  case Intrinsic::x86_sse2_ucomieq_sd:
9184  case Intrinsic::x86_sse2_ucomilt_sd:
9185  case Intrinsic::x86_sse2_ucomile_sd:
9186  case Intrinsic::x86_sse2_ucomigt_sd:
9187  case Intrinsic::x86_sse2_ucomige_sd:
9188  case Intrinsic::x86_sse2_ucomineq_sd: {
9189    unsigned Opc = 0;
9190    ISD::CondCode CC = ISD::SETCC_INVALID;
9191    switch (IntNo) {
9192    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
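    // Illustrative front-end example (hypothetical user code, shown only to
    // make the mapping below concrete): a call such as
    //   int r = _mm_comilt_ss(a, b);   // -> llvm.x86.sse.comilt.ss
    // selects Opc = X86ISD::COMI and CC = ISD::SETLT below, and the result is
    // produced as an X86ISD::SETCC zero-extended to i32.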
9193 case Intrinsic::x86_sse_comieq_ss: 9194 case Intrinsic::x86_sse2_comieq_sd: 9195 Opc = X86ISD::COMI; 9196 CC = ISD::SETEQ; 9197 break; 9198 case Intrinsic::x86_sse_comilt_ss: 9199 case Intrinsic::x86_sse2_comilt_sd: 9200 Opc = X86ISD::COMI; 9201 CC = ISD::SETLT; 9202 break; 9203 case Intrinsic::x86_sse_comile_ss: 9204 case Intrinsic::x86_sse2_comile_sd: 9205 Opc = X86ISD::COMI; 9206 CC = ISD::SETLE; 9207 break; 9208 case Intrinsic::x86_sse_comigt_ss: 9209 case Intrinsic::x86_sse2_comigt_sd: 9210 Opc = X86ISD::COMI; 9211 CC = ISD::SETGT; 9212 break; 9213 case Intrinsic::x86_sse_comige_ss: 9214 case Intrinsic::x86_sse2_comige_sd: 9215 Opc = X86ISD::COMI; 9216 CC = ISD::SETGE; 9217 break; 9218 case Intrinsic::x86_sse_comineq_ss: 9219 case Intrinsic::x86_sse2_comineq_sd: 9220 Opc = X86ISD::COMI; 9221 CC = ISD::SETNE; 9222 break; 9223 case Intrinsic::x86_sse_ucomieq_ss: 9224 case Intrinsic::x86_sse2_ucomieq_sd: 9225 Opc = X86ISD::UCOMI; 9226 CC = ISD::SETEQ; 9227 break; 9228 case Intrinsic::x86_sse_ucomilt_ss: 9229 case Intrinsic::x86_sse2_ucomilt_sd: 9230 Opc = X86ISD::UCOMI; 9231 CC = ISD::SETLT; 9232 break; 9233 case Intrinsic::x86_sse_ucomile_ss: 9234 case Intrinsic::x86_sse2_ucomile_sd: 9235 Opc = X86ISD::UCOMI; 9236 CC = ISD::SETLE; 9237 break; 9238 case Intrinsic::x86_sse_ucomigt_ss: 9239 case Intrinsic::x86_sse2_ucomigt_sd: 9240 Opc = X86ISD::UCOMI; 9241 CC = ISD::SETGT; 9242 break; 9243 case Intrinsic::x86_sse_ucomige_ss: 9244 case Intrinsic::x86_sse2_ucomige_sd: 9245 Opc = X86ISD::UCOMI; 9246 CC = ISD::SETGE; 9247 break; 9248 case Intrinsic::x86_sse_ucomineq_ss: 9249 case Intrinsic::x86_sse2_ucomineq_sd: 9250 Opc = X86ISD::UCOMI; 9251 CC = ISD::SETNE; 9252 break; 9253 } 9254 9255 SDValue LHS = Op.getOperand(1); 9256 SDValue RHS = Op.getOperand(2); 9257 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 9258 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 9259 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 9260 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 9261 DAG.getConstant(X86CC, MVT::i8), Cond); 9262 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 9263 } 9264 // XOP comparison intrinsics 9265 case Intrinsic::x86_xop_vpcomltb: 9266 case Intrinsic::x86_xop_vpcomltw: 9267 case Intrinsic::x86_xop_vpcomltd: 9268 case Intrinsic::x86_xop_vpcomltq: 9269 case Intrinsic::x86_xop_vpcomltub: 9270 case Intrinsic::x86_xop_vpcomltuw: 9271 case Intrinsic::x86_xop_vpcomltud: 9272 case Intrinsic::x86_xop_vpcomltuq: 9273 case Intrinsic::x86_xop_vpcomleb: 9274 case Intrinsic::x86_xop_vpcomlew: 9275 case Intrinsic::x86_xop_vpcomled: 9276 case Intrinsic::x86_xop_vpcomleq: 9277 case Intrinsic::x86_xop_vpcomleub: 9278 case Intrinsic::x86_xop_vpcomleuw: 9279 case Intrinsic::x86_xop_vpcomleud: 9280 case Intrinsic::x86_xop_vpcomleuq: 9281 case Intrinsic::x86_xop_vpcomgtb: 9282 case Intrinsic::x86_xop_vpcomgtw: 9283 case Intrinsic::x86_xop_vpcomgtd: 9284 case Intrinsic::x86_xop_vpcomgtq: 9285 case Intrinsic::x86_xop_vpcomgtub: 9286 case Intrinsic::x86_xop_vpcomgtuw: 9287 case Intrinsic::x86_xop_vpcomgtud: 9288 case Intrinsic::x86_xop_vpcomgtuq: 9289 case Intrinsic::x86_xop_vpcomgeb: 9290 case Intrinsic::x86_xop_vpcomgew: 9291 case Intrinsic::x86_xop_vpcomged: 9292 case Intrinsic::x86_xop_vpcomgeq: 9293 case Intrinsic::x86_xop_vpcomgeub: 9294 case Intrinsic::x86_xop_vpcomgeuw: 9295 case Intrinsic::x86_xop_vpcomgeud: 9296 case Intrinsic::x86_xop_vpcomgeuq: 9297 case Intrinsic::x86_xop_vpcomeqb: 9298 case Intrinsic::x86_xop_vpcomeqw: 9299 case 
Intrinsic::x86_xop_vpcomeqd: 9300 case Intrinsic::x86_xop_vpcomeqq: 9301 case Intrinsic::x86_xop_vpcomequb: 9302 case Intrinsic::x86_xop_vpcomequw: 9303 case Intrinsic::x86_xop_vpcomequd: 9304 case Intrinsic::x86_xop_vpcomequq: 9305 case Intrinsic::x86_xop_vpcomneb: 9306 case Intrinsic::x86_xop_vpcomnew: 9307 case Intrinsic::x86_xop_vpcomned: 9308 case Intrinsic::x86_xop_vpcomneq: 9309 case Intrinsic::x86_xop_vpcomneub: 9310 case Intrinsic::x86_xop_vpcomneuw: 9311 case Intrinsic::x86_xop_vpcomneud: 9312 case Intrinsic::x86_xop_vpcomneuq: 9313 case Intrinsic::x86_xop_vpcomfalseb: 9314 case Intrinsic::x86_xop_vpcomfalsew: 9315 case Intrinsic::x86_xop_vpcomfalsed: 9316 case Intrinsic::x86_xop_vpcomfalseq: 9317 case Intrinsic::x86_xop_vpcomfalseub: 9318 case Intrinsic::x86_xop_vpcomfalseuw: 9319 case Intrinsic::x86_xop_vpcomfalseud: 9320 case Intrinsic::x86_xop_vpcomfalseuq: 9321 case Intrinsic::x86_xop_vpcomtrueb: 9322 case Intrinsic::x86_xop_vpcomtruew: 9323 case Intrinsic::x86_xop_vpcomtrued: 9324 case Intrinsic::x86_xop_vpcomtrueq: 9325 case Intrinsic::x86_xop_vpcomtrueub: 9326 case Intrinsic::x86_xop_vpcomtrueuw: 9327 case Intrinsic::x86_xop_vpcomtrueud: 9328 case Intrinsic::x86_xop_vpcomtrueuq: { 9329 unsigned CC = 0; 9330 unsigned Opc = 0; 9331 9332 switch (IntNo) { 9333 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 9334 case Intrinsic::x86_xop_vpcomltb: 9335 case Intrinsic::x86_xop_vpcomltw: 9336 case Intrinsic::x86_xop_vpcomltd: 9337 case Intrinsic::x86_xop_vpcomltq: 9338 CC = 0; 9339 Opc = X86ISD::VPCOM; 9340 break; 9341 case Intrinsic::x86_xop_vpcomltub: 9342 case Intrinsic::x86_xop_vpcomltuw: 9343 case Intrinsic::x86_xop_vpcomltud: 9344 case Intrinsic::x86_xop_vpcomltuq: 9345 CC = 0; 9346 Opc = X86ISD::VPCOMU; 9347 break; 9348 case Intrinsic::x86_xop_vpcomleb: 9349 case Intrinsic::x86_xop_vpcomlew: 9350 case Intrinsic::x86_xop_vpcomled: 9351 case Intrinsic::x86_xop_vpcomleq: 9352 CC = 1; 9353 Opc = X86ISD::VPCOM; 9354 break; 9355 case Intrinsic::x86_xop_vpcomleub: 9356 case Intrinsic::x86_xop_vpcomleuw: 9357 case Intrinsic::x86_xop_vpcomleud: 9358 case Intrinsic::x86_xop_vpcomleuq: 9359 CC = 1; 9360 Opc = X86ISD::VPCOMU; 9361 break; 9362 case Intrinsic::x86_xop_vpcomgtb: 9363 case Intrinsic::x86_xop_vpcomgtw: 9364 case Intrinsic::x86_xop_vpcomgtd: 9365 case Intrinsic::x86_xop_vpcomgtq: 9366 CC = 2; 9367 Opc = X86ISD::VPCOM; 9368 break; 9369 case Intrinsic::x86_xop_vpcomgtub: 9370 case Intrinsic::x86_xop_vpcomgtuw: 9371 case Intrinsic::x86_xop_vpcomgtud: 9372 case Intrinsic::x86_xop_vpcomgtuq: 9373 CC = 2; 9374 Opc = X86ISD::VPCOMU; 9375 break; 9376 case Intrinsic::x86_xop_vpcomgeb: 9377 case Intrinsic::x86_xop_vpcomgew: 9378 case Intrinsic::x86_xop_vpcomged: 9379 case Intrinsic::x86_xop_vpcomgeq: 9380 CC = 3; 9381 Opc = X86ISD::VPCOM; 9382 break; 9383 case Intrinsic::x86_xop_vpcomgeub: 9384 case Intrinsic::x86_xop_vpcomgeuw: 9385 case Intrinsic::x86_xop_vpcomgeud: 9386 case Intrinsic::x86_xop_vpcomgeuq: 9387 CC = 3; 9388 Opc = X86ISD::VPCOMU; 9389 break; 9390 case Intrinsic::x86_xop_vpcomeqb: 9391 case Intrinsic::x86_xop_vpcomeqw: 9392 case Intrinsic::x86_xop_vpcomeqd: 9393 case Intrinsic::x86_xop_vpcomeqq: 9394 CC = 4; 9395 Opc = X86ISD::VPCOM; 9396 break; 9397 case Intrinsic::x86_xop_vpcomequb: 9398 case Intrinsic::x86_xop_vpcomequw: 9399 case Intrinsic::x86_xop_vpcomequd: 9400 case Intrinsic::x86_xop_vpcomequq: 9401 CC = 4; 9402 Opc = X86ISD::VPCOMU; 9403 break; 9404 case Intrinsic::x86_xop_vpcomneb: 9405 case Intrinsic::x86_xop_vpcomnew: 9406 case 
Intrinsic::x86_xop_vpcomned: 9407 case Intrinsic::x86_xop_vpcomneq: 9408 CC = 5; 9409 Opc = X86ISD::VPCOM; 9410 break; 9411 case Intrinsic::x86_xop_vpcomneub: 9412 case Intrinsic::x86_xop_vpcomneuw: 9413 case Intrinsic::x86_xop_vpcomneud: 9414 case Intrinsic::x86_xop_vpcomneuq: 9415 CC = 5; 9416 Opc = X86ISD::VPCOMU; 9417 break; 9418 case Intrinsic::x86_xop_vpcomfalseb: 9419 case Intrinsic::x86_xop_vpcomfalsew: 9420 case Intrinsic::x86_xop_vpcomfalsed: 9421 case Intrinsic::x86_xop_vpcomfalseq: 9422 CC = 6; 9423 Opc = X86ISD::VPCOM; 9424 break; 9425 case Intrinsic::x86_xop_vpcomfalseub: 9426 case Intrinsic::x86_xop_vpcomfalseuw: 9427 case Intrinsic::x86_xop_vpcomfalseud: 9428 case Intrinsic::x86_xop_vpcomfalseuq: 9429 CC = 6; 9430 Opc = X86ISD::VPCOMU; 9431 break; 9432 case Intrinsic::x86_xop_vpcomtrueb: 9433 case Intrinsic::x86_xop_vpcomtruew: 9434 case Intrinsic::x86_xop_vpcomtrued: 9435 case Intrinsic::x86_xop_vpcomtrueq: 9436 CC = 7; 9437 Opc = X86ISD::VPCOM; 9438 break; 9439 case Intrinsic::x86_xop_vpcomtrueub: 9440 case Intrinsic::x86_xop_vpcomtrueuw: 9441 case Intrinsic::x86_xop_vpcomtrueud: 9442 case Intrinsic::x86_xop_vpcomtrueuq: 9443 CC = 7; 9444 Opc = X86ISD::VPCOMU; 9445 break; 9446 } 9447 9448 SDValue LHS = Op.getOperand(1); 9449 SDValue RHS = Op.getOperand(2); 9450 return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS, 9451 DAG.getConstant(CC, MVT::i8)); 9452 } 9453 9454 // Arithmetic intrinsics. 9455 case Intrinsic::x86_sse2_pmulu_dq: 9456 case Intrinsic::x86_avx2_pmulu_dq: 9457 return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), 9458 Op.getOperand(1), Op.getOperand(2)); 9459 case Intrinsic::x86_sse3_hadd_ps: 9460 case Intrinsic::x86_sse3_hadd_pd: 9461 case Intrinsic::x86_avx_hadd_ps_256: 9462 case Intrinsic::x86_avx_hadd_pd_256: 9463 return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(), 9464 Op.getOperand(1), Op.getOperand(2)); 9465 case Intrinsic::x86_sse3_hsub_ps: 9466 case Intrinsic::x86_sse3_hsub_pd: 9467 case Intrinsic::x86_avx_hsub_ps_256: 9468 case Intrinsic::x86_avx_hsub_pd_256: 9469 return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(), 9470 Op.getOperand(1), Op.getOperand(2)); 9471 case Intrinsic::x86_ssse3_phadd_w_128: 9472 case Intrinsic::x86_ssse3_phadd_d_128: 9473 case Intrinsic::x86_avx2_phadd_w: 9474 case Intrinsic::x86_avx2_phadd_d: 9475 return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(), 9476 Op.getOperand(1), Op.getOperand(2)); 9477 case Intrinsic::x86_ssse3_phsub_w_128: 9478 case Intrinsic::x86_ssse3_phsub_d_128: 9479 case Intrinsic::x86_avx2_phsub_w: 9480 case Intrinsic::x86_avx2_phsub_d: 9481 return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(), 9482 Op.getOperand(1), Op.getOperand(2)); 9483 case Intrinsic::x86_avx2_psllv_d: 9484 case Intrinsic::x86_avx2_psllv_q: 9485 case Intrinsic::x86_avx2_psllv_d_256: 9486 case Intrinsic::x86_avx2_psllv_q_256: 9487 return DAG.getNode(ISD::SHL, dl, Op.getValueType(), 9488 Op.getOperand(1), Op.getOperand(2)); 9489 case Intrinsic::x86_avx2_psrlv_d: 9490 case Intrinsic::x86_avx2_psrlv_q: 9491 case Intrinsic::x86_avx2_psrlv_d_256: 9492 case Intrinsic::x86_avx2_psrlv_q_256: 9493 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), 9494 Op.getOperand(1), Op.getOperand(2)); 9495 case Intrinsic::x86_avx2_psrav_d: 9496 case Intrinsic::x86_avx2_psrav_d_256: 9497 return DAG.getNode(ISD::SRA, dl, Op.getValueType(), 9498 Op.getOperand(1), Op.getOperand(2)); 9499 case Intrinsic::x86_ssse3_pshuf_b_128: 9500 case Intrinsic::x86_avx2_pshuf_b: 9501 return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), 9502 
Op.getOperand(1), Op.getOperand(2)); 9503 case Intrinsic::x86_ssse3_psign_b_128: 9504 case Intrinsic::x86_ssse3_psign_w_128: 9505 case Intrinsic::x86_ssse3_psign_d_128: 9506 case Intrinsic::x86_avx2_psign_b: 9507 case Intrinsic::x86_avx2_psign_w: 9508 case Intrinsic::x86_avx2_psign_d: 9509 return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), 9510 Op.getOperand(1), Op.getOperand(2)); 9511 case Intrinsic::x86_sse41_insertps: 9512 return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), 9513 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 9514 case Intrinsic::x86_avx_vperm2f128_ps_256: 9515 case Intrinsic::x86_avx_vperm2f128_pd_256: 9516 case Intrinsic::x86_avx_vperm2f128_si_256: 9517 case Intrinsic::x86_avx2_vperm2i128: 9518 return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), 9519 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 9520 case Intrinsic::x86_avx_vpermil_ps: 9521 case Intrinsic::x86_avx_vpermil_pd: 9522 case Intrinsic::x86_avx_vpermil_ps_256: 9523 case Intrinsic::x86_avx_vpermil_pd_256: 9524 return DAG.getNode(X86ISD::VPERMILP, dl, Op.getValueType(), 9525 Op.getOperand(1), Op.getOperand(2)); 9526 9527 // ptest and testp intrinsics. The intrinsic these come from are designed to 9528 // return an integer value, not just an instruction so lower it to the ptest 9529 // or testp pattern and a setcc for the result. 9530 case Intrinsic::x86_sse41_ptestz: 9531 case Intrinsic::x86_sse41_ptestc: 9532 case Intrinsic::x86_sse41_ptestnzc: 9533 case Intrinsic::x86_avx_ptestz_256: 9534 case Intrinsic::x86_avx_ptestc_256: 9535 case Intrinsic::x86_avx_ptestnzc_256: 9536 case Intrinsic::x86_avx_vtestz_ps: 9537 case Intrinsic::x86_avx_vtestc_ps: 9538 case Intrinsic::x86_avx_vtestnzc_ps: 9539 case Intrinsic::x86_avx_vtestz_pd: 9540 case Intrinsic::x86_avx_vtestc_pd: 9541 case Intrinsic::x86_avx_vtestnzc_pd: 9542 case Intrinsic::x86_avx_vtestz_ps_256: 9543 case Intrinsic::x86_avx_vtestc_ps_256: 9544 case Intrinsic::x86_avx_vtestnzc_ps_256: 9545 case Intrinsic::x86_avx_vtestz_pd_256: 9546 case Intrinsic::x86_avx_vtestc_pd_256: 9547 case Intrinsic::x86_avx_vtestnzc_pd_256: { 9548 bool IsTestPacked = false; 9549 unsigned X86CC = 0; 9550 switch (IntNo) { 9551 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 9552 case Intrinsic::x86_avx_vtestz_ps: 9553 case Intrinsic::x86_avx_vtestz_pd: 9554 case Intrinsic::x86_avx_vtestz_ps_256: 9555 case Intrinsic::x86_avx_vtestz_pd_256: 9556 IsTestPacked = true; // Fallthrough 9557 case Intrinsic::x86_sse41_ptestz: 9558 case Intrinsic::x86_avx_ptestz_256: 9559 // ZF = 1 9560 X86CC = X86::COND_E; 9561 break; 9562 case Intrinsic::x86_avx_vtestc_ps: 9563 case Intrinsic::x86_avx_vtestc_pd: 9564 case Intrinsic::x86_avx_vtestc_ps_256: 9565 case Intrinsic::x86_avx_vtestc_pd_256: 9566 IsTestPacked = true; // Fallthrough 9567 case Intrinsic::x86_sse41_ptestc: 9568 case Intrinsic::x86_avx_ptestc_256: 9569 // CF = 1 9570 X86CC = X86::COND_B; 9571 break; 9572 case Intrinsic::x86_avx_vtestnzc_ps: 9573 case Intrinsic::x86_avx_vtestnzc_pd: 9574 case Intrinsic::x86_avx_vtestnzc_ps_256: 9575 case Intrinsic::x86_avx_vtestnzc_pd_256: 9576 IsTestPacked = true; // Fallthrough 9577 case Intrinsic::x86_sse41_ptestnzc: 9578 case Intrinsic::x86_avx_ptestnzc_256: 9579 // ZF and CF = 0 9580 X86CC = X86::COND_A; 9581 break; 9582 } 9583 9584 SDValue LHS = Op.getOperand(1); 9585 SDValue RHS = Op.getOperand(2); 9586 unsigned TestOpc = IsTestPacked ? 
X86ISD::TESTP : X86ISD::PTEST; 9587 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 9588 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 9589 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 9590 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 9591 } 9592 9593 // SSE/AVX shift intrinsics 9594 case Intrinsic::x86_sse2_psll_w: 9595 case Intrinsic::x86_sse2_psll_d: 9596 case Intrinsic::x86_sse2_psll_q: 9597 case Intrinsic::x86_avx2_psll_w: 9598 case Intrinsic::x86_avx2_psll_d: 9599 case Intrinsic::x86_avx2_psll_q: 9600 return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(), 9601 Op.getOperand(1), Op.getOperand(2)); 9602 case Intrinsic::x86_sse2_psrl_w: 9603 case Intrinsic::x86_sse2_psrl_d: 9604 case Intrinsic::x86_sse2_psrl_q: 9605 case Intrinsic::x86_avx2_psrl_w: 9606 case Intrinsic::x86_avx2_psrl_d: 9607 case Intrinsic::x86_avx2_psrl_q: 9608 return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(), 9609 Op.getOperand(1), Op.getOperand(2)); 9610 case Intrinsic::x86_sse2_psra_w: 9611 case Intrinsic::x86_sse2_psra_d: 9612 case Intrinsic::x86_avx2_psra_w: 9613 case Intrinsic::x86_avx2_psra_d: 9614 return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(), 9615 Op.getOperand(1), Op.getOperand(2)); 9616 case Intrinsic::x86_sse2_pslli_w: 9617 case Intrinsic::x86_sse2_pslli_d: 9618 case Intrinsic::x86_sse2_pslli_q: 9619 case Intrinsic::x86_avx2_pslli_w: 9620 case Intrinsic::x86_avx2_pslli_d: 9621 case Intrinsic::x86_avx2_pslli_q: 9622 return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(), 9623 Op.getOperand(1), Op.getOperand(2), DAG); 9624 case Intrinsic::x86_sse2_psrli_w: 9625 case Intrinsic::x86_sse2_psrli_d: 9626 case Intrinsic::x86_sse2_psrli_q: 9627 case Intrinsic::x86_avx2_psrli_w: 9628 case Intrinsic::x86_avx2_psrli_d: 9629 case Intrinsic::x86_avx2_psrli_q: 9630 return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(), 9631 Op.getOperand(1), Op.getOperand(2), DAG); 9632 case Intrinsic::x86_sse2_psrai_w: 9633 case Intrinsic::x86_sse2_psrai_d: 9634 case Intrinsic::x86_avx2_psrai_w: 9635 case Intrinsic::x86_avx2_psrai_d: 9636 return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(), 9637 Op.getOperand(1), Op.getOperand(2), DAG); 9638 // Fix vector shift instructions where the last operand is a non-immediate 9639 // i32 value. 
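  // Sketch of the rewrite done below (reader's aid): a variable-amount form
  // such as
  //   llvm.x86.mmx.pslli.w(%v, %amt)              ; %amt not a constant
  // is re-emitted as the register form
  //   llvm.x86.mmx.psll.w(%v, bitcast <2 x i32> <%amt, 0>)
  // because the non-immediate MMX shift reads a 64-bit amount; the high half
  // is zeroed explicitly.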
9640 case Intrinsic::x86_mmx_pslli_w: 9641 case Intrinsic::x86_mmx_pslli_d: 9642 case Intrinsic::x86_mmx_pslli_q: 9643 case Intrinsic::x86_mmx_psrli_w: 9644 case Intrinsic::x86_mmx_psrli_d: 9645 case Intrinsic::x86_mmx_psrli_q: 9646 case Intrinsic::x86_mmx_psrai_w: 9647 case Intrinsic::x86_mmx_psrai_d: { 9648 SDValue ShAmt = Op.getOperand(2); 9649 if (isa<ConstantSDNode>(ShAmt)) 9650 return SDValue(); 9651 9652 unsigned NewIntNo = 0; 9653 switch (IntNo) { 9654 case Intrinsic::x86_mmx_pslli_w: 9655 NewIntNo = Intrinsic::x86_mmx_psll_w; 9656 break; 9657 case Intrinsic::x86_mmx_pslli_d: 9658 NewIntNo = Intrinsic::x86_mmx_psll_d; 9659 break; 9660 case Intrinsic::x86_mmx_pslli_q: 9661 NewIntNo = Intrinsic::x86_mmx_psll_q; 9662 break; 9663 case Intrinsic::x86_mmx_psrli_w: 9664 NewIntNo = Intrinsic::x86_mmx_psrl_w; 9665 break; 9666 case Intrinsic::x86_mmx_psrli_d: 9667 NewIntNo = Intrinsic::x86_mmx_psrl_d; 9668 break; 9669 case Intrinsic::x86_mmx_psrli_q: 9670 NewIntNo = Intrinsic::x86_mmx_psrl_q; 9671 break; 9672 case Intrinsic::x86_mmx_psrai_w: 9673 NewIntNo = Intrinsic::x86_mmx_psra_w; 9674 break; 9675 case Intrinsic::x86_mmx_psrai_d: 9676 NewIntNo = Intrinsic::x86_mmx_psra_d; 9677 break; 9678 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 9679 } 9680 9681 // The vector shift intrinsics with scalars uses 32b shift amounts but 9682 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 9683 // to be zero. 9684 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt, 9685 DAG.getConstant(0, MVT::i32)); 9686// FIXME this must be lowered to get rid of the invalid type. 9687 9688 EVT VT = Op.getValueType(); 9689 ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); 9690 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9691 DAG.getConstant(NewIntNo, MVT::i32), 9692 Op.getOperand(1), ShAmt); 9693 } 9694 } 9695} 9696 9697SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 9698 SelectionDAG &DAG) const { 9699 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 9700 MFI->setReturnAddressIsTaken(true); 9701 9702 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9703 DebugLoc dl = Op.getDebugLoc(); 9704 9705 if (Depth > 0) { 9706 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 9707 SDValue Offset = 9708 DAG.getConstant(TD->getPointerSize(), 9709 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 9710 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 9711 DAG.getNode(ISD::ADD, dl, getPointerTy(), 9712 FrameAddr, Offset), 9713 MachinePointerInfo(), false, false, false, 0); 9714 } 9715 9716 // Just load the return address. 9717 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 9718 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 9719 RetAddrFI, MachinePointerInfo(), false, false, false, 0); 9720} 9721 9722SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 9723 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 9724 MFI->setFrameAddressIsTaken(true); 9725 9726 EVT VT = Op.getValueType(); 9727 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 9728 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9729 unsigned FrameReg = Subtarget->is64Bit() ? 
X86::RBP : X86::EBP; 9730 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 9731 while (Depth--) 9732 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 9733 MachinePointerInfo(), 9734 false, false, false, 0); 9735 return FrameAddr; 9736} 9737 9738SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 9739 SelectionDAG &DAG) const { 9740 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 9741} 9742 9743SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 9744 MachineFunction &MF = DAG.getMachineFunction(); 9745 SDValue Chain = Op.getOperand(0); 9746 SDValue Offset = Op.getOperand(1); 9747 SDValue Handler = Op.getOperand(2); 9748 DebugLoc dl = Op.getDebugLoc(); 9749 9750 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 9751 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 9752 getPointerTy()); 9753 unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); 9754 9755 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 9756 DAG.getIntPtrConstant(TD->getPointerSize())); 9757 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 9758 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 9759 false, false, 0); 9760 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 9761 MF.getRegInfo().addLiveOut(StoreAddrReg); 9762 9763 return DAG.getNode(X86ISD::EH_RETURN, dl, 9764 MVT::Other, 9765 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 9766} 9767 9768SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, 9769 SelectionDAG &DAG) const { 9770 return Op.getOperand(0); 9771} 9772 9773SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 9774 SelectionDAG &DAG) const { 9775 SDValue Root = Op.getOperand(0); 9776 SDValue Trmp = Op.getOperand(1); // trampoline 9777 SDValue FPtr = Op.getOperand(2); // nested function 9778 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 9779 DebugLoc dl = Op.getDebugLoc(); 9780 9781 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 9782 9783 if (Subtarget->is64Bit()) { 9784 SDValue OutChains[6]; 9785 9786 // Large code-model. 9787 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 9788 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 9789 9790 const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); 9791 const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); 9792 9793 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 9794 9795 // Load the pointer to the nested function into R11. 9796 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 9797 SDValue Addr = Trmp; 9798 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9799 Addr, MachinePointerInfo(TrmpAddr), 9800 false, false, 0); 9801 9802 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9803 DAG.getConstant(2, MVT::i64)); 9804 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 9805 MachinePointerInfo(TrmpAddr, 2), 9806 false, false, 2); 9807 9808 // Load the 'nest' parameter value into R10. 
9809 // R10 is specified in X86CallingConv.td 9810 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 9811 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9812 DAG.getConstant(10, MVT::i64)); 9813 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9814 Addr, MachinePointerInfo(TrmpAddr, 10), 9815 false, false, 0); 9816 9817 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9818 DAG.getConstant(12, MVT::i64)); 9819 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 9820 MachinePointerInfo(TrmpAddr, 12), 9821 false, false, 2); 9822 9823 // Jump to the nested function. 9824 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 9825 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9826 DAG.getConstant(20, MVT::i64)); 9827 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9828 Addr, MachinePointerInfo(TrmpAddr, 20), 9829 false, false, 0); 9830 9831 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 9832 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9833 DAG.getConstant(22, MVT::i64)); 9834 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 9835 MachinePointerInfo(TrmpAddr, 22), 9836 false, false, 0); 9837 9838 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); 9839 } else { 9840 const Function *Func = 9841 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 9842 CallingConv::ID CC = Func->getCallingConv(); 9843 unsigned NestReg; 9844 9845 switch (CC) { 9846 default: 9847 llvm_unreachable("Unsupported calling convention"); 9848 case CallingConv::C: 9849 case CallingConv::X86_StdCall: { 9850 // Pass 'nest' parameter in ECX. 9851 // Must be kept in sync with X86CallingConv.td 9852 NestReg = X86::ECX; 9853 9854 // Check that ECX wasn't needed by an 'inreg' parameter. 9855 FunctionType *FTy = Func->getFunctionType(); 9856 const AttrListPtr &Attrs = Func->getAttributes(); 9857 9858 if (!Attrs.isEmpty() && !Func->isVarArg()) { 9859 unsigned InRegCount = 0; 9860 unsigned Idx = 1; 9861 9862 for (FunctionType::param_iterator I = FTy->param_begin(), 9863 E = FTy->param_end(); I != E; ++I, ++Idx) 9864 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 9865 // FIXME: should only count parameters that are lowered to integers. 9866 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 9867 9868 if (InRegCount > 2) { 9869 report_fatal_error("Nest register in use - reduce number of inreg" 9870 " parameters!"); 9871 } 9872 } 9873 break; 9874 } 9875 case CallingConv::X86_FastCall: 9876 case CallingConv::X86_ThisCall: 9877 case CallingConv::Fast: 9878 // Pass 'nest' parameter in EAX. 9879 // Must be kept in sync with X86CallingConv.td 9880 NestReg = X86::EAX; 9881 break; 9882 } 9883 9884 SDValue OutChains[4]; 9885 SDValue Addr, Disp; 9886 9887 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9888 DAG.getConstant(10, MVT::i32)); 9889 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 9890 9891 // This is storing the opcode for MOV32ri. 9892 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 9893 const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); 9894 OutChains[0] = DAG.getStore(Root, dl, 9895 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 9896 Trmp, MachinePointerInfo(TrmpAddr), 9897 false, false, 0); 9898 9899 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9900 DAG.getConstant(1, MVT::i32)); 9901 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 9902 MachinePointerInfo(TrmpAddr, 1), 9903 false, false, 1); 9904 9905 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
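    // (Reader's sketch of the 10-byte 32-bit trampoline assembled by these
    //  stores; the byte offsets are the ones used above and below:
    //    +0: B8+reg imm32   mov  $nest into %ecx or %eax
    //    +5: E9 rel32       jmp  to the nested function
    //  where rel32 is the Disp computed earlier as FPtr - (Trmp + 10).)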
9906 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9907 DAG.getConstant(5, MVT::i32)); 9908 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 9909 MachinePointerInfo(TrmpAddr, 5), 9910 false, false, 1); 9911 9912 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9913 DAG.getConstant(6, MVT::i32)); 9914 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 9915 MachinePointerInfo(TrmpAddr, 6), 9916 false, false, 1); 9917 9918 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); 9919 } 9920} 9921 9922SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 9923 SelectionDAG &DAG) const { 9924 /* 9925 The rounding mode is in bits 11:10 of FPSR, and has the following 9926 settings: 9927 00 Round to nearest 9928 01 Round to -inf 9929 10 Round to +inf 9930 11 Round to 0 9931 9932 FLT_ROUNDS, on the other hand, expects the following: 9933 -1 Undefined 9934 0 Round to 0 9935 1 Round to nearest 9936 2 Round to +inf 9937 3 Round to -inf 9938 9939 To perform the conversion, we do: 9940 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 9941 */ 9942 9943 MachineFunction &MF = DAG.getMachineFunction(); 9944 const TargetMachine &TM = MF.getTarget(); 9945 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 9946 unsigned StackAlignment = TFI.getStackAlignment(); 9947 EVT VT = Op.getValueType(); 9948 DebugLoc DL = Op.getDebugLoc(); 9949 9950 // Save FP Control Word to stack slot 9951 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 9952 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 9953 9954 9955 MachineMemOperand *MMO = 9956 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 9957 MachineMemOperand::MOStore, 2, 2); 9958 9959 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 9960 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 9961 DAG.getVTList(MVT::Other), 9962 Ops, 2, MVT::i16, MMO); 9963 9964 // Load FP Control Word from stack slot 9965 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 9966 MachinePointerInfo(), false, false, false, 0); 9967 9968 // Transform as necessary 9969 SDValue CWD1 = 9970 DAG.getNode(ISD::SRL, DL, MVT::i16, 9971 DAG.getNode(ISD::AND, DL, MVT::i16, 9972 CWD, DAG.getConstant(0x800, MVT::i16)), 9973 DAG.getConstant(11, MVT::i8)); 9974 SDValue CWD2 = 9975 DAG.getNode(ISD::SRL, DL, MVT::i16, 9976 DAG.getNode(ISD::AND, DL, MVT::i16, 9977 CWD, DAG.getConstant(0x400, MVT::i16)), 9978 DAG.getConstant(9, MVT::i8)); 9979 9980 SDValue RetVal = 9981 DAG.getNode(ISD::AND, DL, MVT::i16, 9982 DAG.getNode(ISD::ADD, DL, MVT::i16, 9983 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 9984 DAG.getConstant(1, MVT::i16)), 9985 DAG.getConstant(3, MVT::i16)); 9986 9987 9988 return DAG.getNode((VT.getSizeInBits() < 16 ? 9989 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 9990} 9991 9992SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 9993 EVT VT = Op.getValueType(); 9994 EVT OpVT = VT; 9995 unsigned NumBits = VT.getSizeInBits(); 9996 DebugLoc dl = Op.getDebugLoc(); 9997 9998 Op = Op.getOperand(0); 9999 if (VT == MVT::i8) { 10000 // Zero extend to i32 since there is not an i8 bsr. 10001 OpVT = MVT::i32; 10002 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 10003 } 10004 10005 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 10006 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 10007 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 10008 10009 // If src is zero (i.e. bsr sets ZF), returns NumBits. 
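  // Worked example for i32 (reader's aid for the CMOV/XOR sequence below):
  //   x = 0x00010000 -> bsr = 16 -> 16 ^ 31 = 15 = ctlz(x)
  //   x = 0          -> ZF set, CMOV picks 2*32-1 = 63 -> 63 ^ 31 = 32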
10010 SDValue Ops[] = { 10011 Op, 10012 DAG.getConstant(NumBits+NumBits-1, OpVT), 10013 DAG.getConstant(X86::COND_E, MVT::i8), 10014 Op.getValue(1) 10015 }; 10016 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 10017 10018 // Finally xor with NumBits-1. 10019 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 10020 10021 if (VT == MVT::i8) 10022 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 10023 return Op; 10024} 10025 10026SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, 10027 SelectionDAG &DAG) const { 10028 EVT VT = Op.getValueType(); 10029 EVT OpVT = VT; 10030 unsigned NumBits = VT.getSizeInBits(); 10031 DebugLoc dl = Op.getDebugLoc(); 10032 10033 Op = Op.getOperand(0); 10034 if (VT == MVT::i8) { 10035 // Zero extend to i32 since there is not an i8 bsr. 10036 OpVT = MVT::i32; 10037 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 10038 } 10039 10040 // Issue a bsr (scan bits in reverse). 10041 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 10042 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 10043 10044 // And xor with NumBits-1. 10045 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 10046 10047 if (VT == MVT::i8) 10048 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 10049 return Op; 10050} 10051 10052SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 10053 EVT VT = Op.getValueType(); 10054 unsigned NumBits = VT.getSizeInBits(); 10055 DebugLoc dl = Op.getDebugLoc(); 10056 Op = Op.getOperand(0); 10057 10058 // Issue a bsf (scan bits forward) which also sets EFLAGS. 10059 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 10060 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 10061 10062 // If src is zero (i.e. bsf sets ZF), returns NumBits. 10063 SDValue Ops[] = { 10064 Op, 10065 DAG.getConstant(NumBits, VT), 10066 DAG.getConstant(X86::COND_E, MVT::i8), 10067 Op.getValue(1) 10068 }; 10069 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); 10070} 10071 10072// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit 10073// ones, and then concatenate the result back. 
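// For instance (reader's illustration): an ISD::ADD on v8i32 becomes two
// v4i32 adds on the extracted low and high 128-bit halves, rejoined with
// ISD::CONCAT_VECTORS.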
10074static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 10075 EVT VT = Op.getValueType(); 10076 10077 assert(VT.getSizeInBits() == 256 && VT.isInteger() && 10078 "Unsupported value type for operation"); 10079 10080 int NumElems = VT.getVectorNumElements(); 10081 DebugLoc dl = Op.getDebugLoc(); 10082 SDValue Idx0 = DAG.getConstant(0, MVT::i32); 10083 SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); 10084 10085 // Extract the LHS vectors 10086 SDValue LHS = Op.getOperand(0); 10087 SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); 10088 SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); 10089 10090 // Extract the RHS vectors 10091 SDValue RHS = Op.getOperand(1); 10092 SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); 10093 SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); 10094 10095 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 10096 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 10097 10098 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 10099 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 10100 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 10101} 10102 10103SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const { 10104 assert(Op.getValueType().getSizeInBits() == 256 && 10105 Op.getValueType().isInteger() && 10106 "Only handle AVX 256-bit vector integer operation"); 10107 return Lower256IntArith(Op, DAG); 10108} 10109 10110SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const { 10111 assert(Op.getValueType().getSizeInBits() == 256 && 10112 Op.getValueType().isInteger() && 10113 "Only handle AVX 256-bit vector integer operation"); 10114 return Lower256IntArith(Op, DAG); 10115} 10116 10117SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { 10118 EVT VT = Op.getValueType(); 10119 10120 // Decompose 256-bit ops into smaller 128-bit ops. 10121 if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) 10122 return Lower256IntArith(Op, DAG); 10123 10124 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && 10125 "Only know how to lower V2I64/V4I64 multiply"); 10126 10127 DebugLoc dl = Op.getDebugLoc(); 10128 10129 // Ahi = psrlqi(a, 32); 10130 // Bhi = psrlqi(b, 32); 10131 // 10132 // AloBlo = pmuludq(a, b); 10133 // AloBhi = pmuludq(a, Bhi); 10134 // AhiBlo = pmuludq(Ahi, b); 10135 10136 // AloBhi = psllqi(AloBhi, 32); 10137 // AhiBlo = psllqi(AhiBlo, 32); 10138 // return AloBlo + AloBhi + AhiBlo; 10139 10140 SDValue A = Op.getOperand(0); 10141 SDValue B = Op.getOperand(1); 10142 10143 SDValue ShAmt = DAG.getConstant(32, MVT::i32); 10144 10145 SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); 10146 SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); 10147 10148 // Bit cast to 32-bit vectors for MULUDQ 10149 EVT MulVT = (VT == MVT::v2i64) ? 
MVT::v4i32 : MVT::v8i32; 10150 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); 10151 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); 10152 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); 10153 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); 10154 10155 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); 10156 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); 10157 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); 10158 10159 AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); 10160 AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); 10161 10162 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 10163 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 10164} 10165 10166SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 10167 10168 EVT VT = Op.getValueType(); 10169 DebugLoc dl = Op.getDebugLoc(); 10170 SDValue R = Op.getOperand(0); 10171 SDValue Amt = Op.getOperand(1); 10172 LLVMContext *Context = DAG.getContext(); 10173 10174 if (!Subtarget->hasSSE2()) 10175 return SDValue(); 10176 10177 // Optimize shl/srl/sra with constant shift amount. 10178 if (isSplatVector(Amt.getNode())) { 10179 SDValue SclrAmt = Amt->getOperand(0); 10180 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 10181 uint64_t ShiftAmt = C->getZExtValue(); 10182 10183 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 10184 (Subtarget->hasAVX2() && 10185 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { 10186 if (Op.getOpcode() == ISD::SHL) 10187 return DAG.getNode(X86ISD::VSHLI, dl, VT, R, 10188 DAG.getConstant(ShiftAmt, MVT::i32)); 10189 if (Op.getOpcode() == ISD::SRL) 10190 return DAG.getNode(X86ISD::VSRLI, dl, VT, R, 10191 DAG.getConstant(ShiftAmt, MVT::i32)); 10192 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) 10193 return DAG.getNode(X86ISD::VSRAI, dl, VT, R, 10194 DAG.getConstant(ShiftAmt, MVT::i32)); 10195 } 10196 10197 if (VT == MVT::v16i8) { 10198 if (Op.getOpcode() == ISD::SHL) { 10199 // Make a large shift. 10200 SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R, 10201 DAG.getConstant(ShiftAmt, MVT::i32)); 10202 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 10203 // Zero out the rightmost bits. 10204 SmallVector<SDValue, 16> V(16, 10205 DAG.getConstant(uint8_t(-1U << ShiftAmt), 10206 MVT::i8)); 10207 return DAG.getNode(ISD::AND, dl, VT, SHL, 10208 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 10209 } 10210 if (Op.getOpcode() == ISD::SRL) { 10211 // Make a large shift. 10212 SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R, 10213 DAG.getConstant(ShiftAmt, MVT::i32)); 10214 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 10215 // Zero out the leftmost bits. 
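        // (Reader's note on the trick used here and for SHL above: there is
        //  no byte-granular shift instruction, so the vector is shifted as
        //  v8i16 words and the bits that leak across byte boundaries are
        //  masked off; e.g. for a logical right shift by 3 each byte is
        //  ANDed with 0xFF >> 3 = 0x1F.)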
10216 SmallVector<SDValue, 16> V(16, 10217 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 10218 MVT::i8)); 10219 return DAG.getNode(ISD::AND, dl, VT, SRL, 10220 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 10221 } 10222 if (Op.getOpcode() == ISD::SRA) { 10223 if (ShiftAmt == 7) { 10224 // R s>> 7 === R s< 0 10225 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 10226 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 10227 } 10228 10229 // R s>> a === ((R u>> a) ^ m) - m 10230 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 10231 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, 10232 MVT::i8)); 10233 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); 10234 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 10235 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 10236 return Res; 10237 } 10238 } 10239 10240 if (Subtarget->hasAVX2() && VT == MVT::v32i8) { 10241 if (Op.getOpcode() == ISD::SHL) { 10242 // Make a large shift. 10243 SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, 10244 DAG.getConstant(ShiftAmt, MVT::i32)); 10245 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 10246 // Zero out the rightmost bits. 10247 SmallVector<SDValue, 32> V(32, 10248 DAG.getConstant(uint8_t(-1U << ShiftAmt), 10249 MVT::i8)); 10250 return DAG.getNode(ISD::AND, dl, VT, SHL, 10251 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 10252 } 10253 if (Op.getOpcode() == ISD::SRL) { 10254 // Make a large shift. 10255 SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R, 10256 DAG.getConstant(ShiftAmt, MVT::i32)); 10257 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 10258 // Zero out the leftmost bits. 10259 SmallVector<SDValue, 32> V(32, 10260 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 10261 MVT::i8)); 10262 return DAG.getNode(ISD::AND, dl, VT, SRL, 10263 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 10264 } 10265 if (Op.getOpcode() == ISD::SRA) { 10266 if (ShiftAmt == 7) { 10267 // R s>> 7 === R s< 0 10268 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 10269 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 10270 } 10271 10272 // R s>> a === ((R u>> a) ^ m) - m 10273 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 10274 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, 10275 MVT::i8)); 10276 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); 10277 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 10278 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 10279 return Res; 10280 } 10281 } 10282 } 10283 } 10284 10285 // Lower SHL with variable shift amount. 
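  // Sketch of the v4i32 trick that follows (reader's aid): (amt << 23) added
  // to the splat 0x3f800000 (the bits of 1.0f) forms the IEEE-754 single-
  // precision pattern of 2^amt in each lane, so after the bitcast and
  // FP_TO_SINT the lanes hold 2^amt and x << amt is computed as x * 2^amt.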
10286 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 10287 Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1), 10288 DAG.getConstant(23, MVT::i32)); 10289 10290 const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U}; 10291 Constant *C = ConstantDataVector::get(*Context, CV); 10292 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 10293 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 10294 MachinePointerInfo::getConstantPool(), 10295 false, false, false, 16); 10296 10297 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 10298 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 10299 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 10300 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 10301 } 10302 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 10303 assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); 10304 10305 // a = a << 5; 10306 Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1), 10307 DAG.getConstant(5, MVT::i32)); 10308 Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); 10309 10310 // Turn 'a' into a mask suitable for VSELECT 10311 SDValue VSelM = DAG.getConstant(0x80, VT); 10312 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10313 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 10314 10315 SDValue CM1 = DAG.getConstant(0x0f, VT); 10316 SDValue CM2 = DAG.getConstant(0x3f, VT); 10317 10318 // r = VSELECT(r, psllw(r & (char16)15, 4), a); 10319 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); 10320 M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 10321 DAG.getConstant(4, MVT::i32), DAG); 10322 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 10323 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 10324 10325 // a += a 10326 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 10327 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10328 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 10329 10330 // r = VSELECT(r, psllw(r & (char16)63, 2), a); 10331 M = DAG.getNode(ISD::AND, dl, VT, R, CM2); 10332 M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 10333 DAG.getConstant(2, MVT::i32), DAG); 10334 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 10335 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 10336 10337 // a += a 10338 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 10339 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10340 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 10341 10342 // return VSELECT(r, r+r, a); 10343 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, 10344 DAG.getNode(ISD::ADD, dl, VT, R, R), R); 10345 return R; 10346 } 10347 10348 // Decompose 256-bit shifts into smaller 128-bit shifts. 
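  // (Reader's illustration: a v8i32 shift that cannot be done in one 128-bit
  //  operation is split into two v4i32 shifts on the extracted halves; a
  //  constant BUILD_VECTOR amount is split element-wise, a variable amount is
  //  extracted the same way, and the halves are re-concatenated.)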
10349 if (VT.getSizeInBits() == 256) { 10350 unsigned NumElems = VT.getVectorNumElements(); 10351 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 10352 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 10353 10354 // Extract the two vectors 10355 SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); 10356 SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), 10357 DAG, dl); 10358 10359 // Recreate the shift amount vectors 10360 SDValue Amt1, Amt2; 10361 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 10362 // Constant shift amount 10363 SmallVector<SDValue, 4> Amt1Csts; 10364 SmallVector<SDValue, 4> Amt2Csts; 10365 for (unsigned i = 0; i != NumElems/2; ++i) 10366 Amt1Csts.push_back(Amt->getOperand(i)); 10367 for (unsigned i = NumElems/2; i != NumElems; ++i) 10368 Amt2Csts.push_back(Amt->getOperand(i)); 10369 10370 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 10371 &Amt1Csts[0], NumElems/2); 10372 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 10373 &Amt2Csts[0], NumElems/2); 10374 } else { 10375 // Variable shift amount 10376 Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl); 10377 Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32), 10378 DAG, dl); 10379 } 10380 10381 // Issue new vector shifts for the smaller types 10382 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); 10383 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); 10384 10385 // Concatenate the result back 10386 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); 10387 } 10388 10389 return SDValue(); 10390} 10391 10392SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 10393 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 10394 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 10395 // looks for this combo and may remove the "setcc" instruction if the "setcc" 10396 // has only one use. 10397 SDNode *N = Op.getNode(); 10398 SDValue LHS = N->getOperand(0); 10399 SDValue RHS = N->getOperand(1); 10400 unsigned BaseOp = 0; 10401 unsigned Cond = 0; 10402 DebugLoc DL = Op.getDebugLoc(); 10403 switch (Op.getOpcode()) { 10404 default: llvm_unreachable("Unknown ovf instruction!"); 10405 case ISD::SADDO: 10406 // A subtract of one will be selected as a INC. Note that INC doesn't 10407 // set CF, so we can't do this for UADDO. 10408 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 10409 if (C->isOne()) { 10410 BaseOp = X86ISD::INC; 10411 Cond = X86::COND_O; 10412 break; 10413 } 10414 BaseOp = X86ISD::ADD; 10415 Cond = X86::COND_O; 10416 break; 10417 case ISD::UADDO: 10418 BaseOp = X86ISD::ADD; 10419 Cond = X86::COND_B; 10420 break; 10421 case ISD::SSUBO: 10422 // A subtract of one will be selected as a DEC. Note that DEC doesn't 10423 // set CF, so we can't do this for USUBO. 
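    // (General note on the flags used throughout this switch: OF reports
    //  signed overflow and CF reports unsigned carry/borrow, hence COND_O for
    //  the signed cases and COND_B for UADDO/USUBO; e.g. the i8 add 127 + 1
    //  sets OF but not CF, while 255 + 1 sets CF but not OF.)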
10424 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 10425 if (C->isOne()) { 10426 BaseOp = X86ISD::DEC; 10427 Cond = X86::COND_O; 10428 break; 10429 } 10430 BaseOp = X86ISD::SUB; 10431 Cond = X86::COND_O; 10432 break; 10433 case ISD::USUBO: 10434 BaseOp = X86ISD::SUB; 10435 Cond = X86::COND_B; 10436 break; 10437 case ISD::SMULO: 10438 BaseOp = X86ISD::SMUL; 10439 Cond = X86::COND_O; 10440 break; 10441 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 10442 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 10443 MVT::i32); 10444 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 10445 10446 SDValue SetCC = 10447 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10448 DAG.getConstant(X86::COND_O, MVT::i32), 10449 SDValue(Sum.getNode(), 2)); 10450 10451 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 10452 } 10453 } 10454 10455 // Also sets EFLAGS. 10456 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 10457 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 10458 10459 SDValue SetCC = 10460 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 10461 DAG.getConstant(Cond, MVT::i32), 10462 SDValue(Sum.getNode(), 1)); 10463 10464 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 10465} 10466 10467SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 10468 SelectionDAG &DAG) const { 10469 DebugLoc dl = Op.getDebugLoc(); 10470 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 10471 EVT VT = Op.getValueType(); 10472 10473 if (!Subtarget->hasSSE2() || !VT.isVector()) 10474 return SDValue(); 10475 10476 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 10477 ExtraVT.getScalarType().getSizeInBits(); 10478 SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); 10479 10480 switch (VT.getSimpleVT().SimpleTy) { 10481 default: return SDValue(); 10482 case MVT::v8i32: 10483 case MVT::v16i16: 10484 if (!Subtarget->hasAVX()) 10485 return SDValue(); 10486 if (!Subtarget->hasAVX2()) { 10487 // needs to be split 10488 int NumElems = VT.getVectorNumElements(); 10489 SDValue Idx0 = DAG.getConstant(0, MVT::i32); 10490 SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); 10491 10492 // Extract the LHS vectors 10493 SDValue LHS = Op.getOperand(0); 10494 SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); 10495 SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); 10496 10497 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 10498 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 10499 10500 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 10501 int ExtraNumElems = ExtraVT.getVectorNumElements(); 10502 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, 10503 ExtraNumElems/2); 10504 SDValue Extra = DAG.getValueType(ExtraVT); 10505 10506 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); 10507 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); 10508 10509 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);; 10510 } 10511 // fall through 10512 case MVT::v4i32: 10513 case MVT::v8i16: { 10514 SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, 10515 Op.getOperand(0), ShAmt, DAG); 10516 return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG); 10517 } 10518 } 10519} 10520 10521 10522SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 10523 DebugLoc dl = Op.getDebugLoc(); 10524 10525 // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. 
10526 // There isn't any reason to disable it if the target processor supports it. 10527 if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { 10528 SDValue Chain = Op.getOperand(0); 10529 SDValue Zero = DAG.getConstant(0, MVT::i32); 10530 SDValue Ops[] = { 10531 DAG.getRegister(X86::ESP, MVT::i32), // Base 10532 DAG.getTargetConstant(1, MVT::i8), // Scale 10533 DAG.getRegister(0, MVT::i32), // Index 10534 DAG.getTargetConstant(0, MVT::i32), // Disp 10535 DAG.getRegister(0, MVT::i32), // Segment. 10536 Zero, 10537 Chain 10538 }; 10539 SDNode *Res = 10540 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 10541 array_lengthof(Ops)); 10542 return SDValue(Res, 0); 10543 } 10544 10545 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 10546 if (!isDev) 10547 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 10548 10549 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10550 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 10551 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 10552 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 10553 10554 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 10555 if (!Op1 && !Op2 && !Op3 && Op4) 10556 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 10557 10558 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 10559 if (Op1 && !Op2 && !Op3 && !Op4) 10560 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 10561 10562 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 10563 // (MFENCE)>; 10564 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 10565} 10566 10567SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, 10568 SelectionDAG &DAG) const { 10569 DebugLoc dl = Op.getDebugLoc(); 10570 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 10571 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 10572 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 10573 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 10574 10575 // The only fence that needs an instruction is a sequentially-consistent 10576 // cross-thread fence. 10577 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 10578 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 10579 // no-sse2). There isn't any reason to disable it if the target processor 10580 // supports it. 10581 if (Subtarget->hasSSE2() || Subtarget->is64Bit()) 10582 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 10583 10584 SDValue Chain = Op.getOperand(0); 10585 SDValue Zero = DAG.getConstant(0, MVT::i32); 10586 SDValue Ops[] = { 10587 DAG.getRegister(X86::ESP, MVT::i32), // Base 10588 DAG.getTargetConstant(1, MVT::i8), // Scale 10589 DAG.getRegister(0, MVT::i32), // Index 10590 DAG.getTargetConstant(0, MVT::i32), // Disp 10591 DAG.getRegister(0, MVT::i32), // Segment. 10592 Zero, 10593 Chain 10594 }; 10595 SDNode *Res = 10596 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 10597 array_lengthof(Ops)); 10598 return SDValue(Res, 0); 10599 } 10600 10601 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
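  // (Reader's note on the locked-OR path above: the X86::OR32mrLocked node
  //  built from { %esp, scale 1, no index, disp 0, no segment, 0 } ORs a zero
  //  value into the word at (%esp) under the lock prefix, the usual
  //  full-barrier idiom when mfence is not available.)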
10602 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 10603} 10604 10605 10606SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 10607 EVT T = Op.getValueType(); 10608 DebugLoc DL = Op.getDebugLoc(); 10609 unsigned Reg = 0; 10610 unsigned size = 0; 10611 switch(T.getSimpleVT().SimpleTy) { 10612 default: llvm_unreachable("Invalid value type!"); 10613 case MVT::i8: Reg = X86::AL; size = 1; break; 10614 case MVT::i16: Reg = X86::AX; size = 2; break; 10615 case MVT::i32: Reg = X86::EAX; size = 4; break; 10616 case MVT::i64: 10617 assert(Subtarget->is64Bit() && "Node not type legal!"); 10618 Reg = X86::RAX; size = 8; 10619 break; 10620 } 10621 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 10622 Op.getOperand(2), SDValue()); 10623 SDValue Ops[] = { cpIn.getValue(0), 10624 Op.getOperand(1), 10625 Op.getOperand(3), 10626 DAG.getTargetConstant(size, MVT::i8), 10627 cpIn.getValue(1) }; 10628 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10629 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 10630 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 10631 Ops, 5, T, MMO); 10632 SDValue cpOut = 10633 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 10634 return cpOut; 10635} 10636 10637SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 10638 SelectionDAG &DAG) const { 10639 assert(Subtarget->is64Bit() && "Result not type legalized?"); 10640 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10641 SDValue TheChain = Op.getOperand(0); 10642 DebugLoc dl = Op.getDebugLoc(); 10643 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 10644 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 10645 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 10646 rax.getValue(2)); 10647 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 10648 DAG.getConstant(32, MVT::i8)); 10649 SDValue Ops[] = { 10650 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 10651 rdx.getValue(1) 10652 }; 10653 return DAG.getMergeValues(Ops, 2, dl); 10654} 10655 10656SDValue X86TargetLowering::LowerBITCAST(SDValue Op, 10657 SelectionDAG &DAG) const { 10658 EVT SrcVT = Op.getOperand(0).getValueType(); 10659 EVT DstVT = Op.getValueType(); 10660 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 10661 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 10662 assert((DstVT == MVT::i64 || 10663 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 10664 "Unexpected custom BITCAST"); 10665 // i64 <=> MMX conversions are Legal. 10666 if (SrcVT==MVT::i64 && DstVT.isVector()) 10667 return Op; 10668 if (DstVT==MVT::i64 && SrcVT.isVector()) 10669 return Op; 10670 // MMX <=> MMX conversions are Legal. 10671 if (SrcVT.isVector() && DstVT.isVector()) 10672 return Op; 10673 // All other conversions need to be expanded. 
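  // (Reader's example: on x86-64 with MMX but no SSE2, bitcasts such as
  //  i64 -> <8 x i8> or <2 x i32> -> <4 x i16> take one of the early returns
  //  above; anything that misses those checks falls through here and is left
  //  for generic expansion.)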
10674 return SDValue(); 10675} 10676 10677SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 10678 SDNode *Node = Op.getNode(); 10679 DebugLoc dl = Node->getDebugLoc(); 10680 EVT T = Node->getValueType(0); 10681 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 10682 DAG.getConstant(0, T), Node->getOperand(2)); 10683 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 10684 cast<AtomicSDNode>(Node)->getMemoryVT(), 10685 Node->getOperand(0), 10686 Node->getOperand(1), negOp, 10687 cast<AtomicSDNode>(Node)->getSrcValue(), 10688 cast<AtomicSDNode>(Node)->getAlignment(), 10689 cast<AtomicSDNode>(Node)->getOrdering(), 10690 cast<AtomicSDNode>(Node)->getSynchScope()); 10691} 10692 10693static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 10694 SDNode *Node = Op.getNode(); 10695 DebugLoc dl = Node->getDebugLoc(); 10696 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 10697 10698 // Convert seq_cst store -> xchg 10699 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 10700 // FIXME: On 32-bit, store -> fist or movq would be more efficient 10701 // (The only way to get a 16-byte store is cmpxchg16b) 10702 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 10703 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 10704 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 10705 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 10706 cast<AtomicSDNode>(Node)->getMemoryVT(), 10707 Node->getOperand(0), 10708 Node->getOperand(1), Node->getOperand(2), 10709 cast<AtomicSDNode>(Node)->getMemOperand(), 10710 cast<AtomicSDNode>(Node)->getOrdering(), 10711 cast<AtomicSDNode>(Node)->getSynchScope()); 10712 return Swap.getValue(1); 10713 } 10714 // Other atomic stores have a simple pattern. 10715 return Op; 10716} 10717 10718static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 10719 EVT VT = Op.getNode()->getValueType(0); 10720 10721 // Let legalize expand this if it isn't a legal type yet. 10722 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 10723 return SDValue(); 10724 10725 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 10726 10727 unsigned Opc; 10728 bool ExtraOp = false; 10729 switch (Op.getOpcode()) { 10730 default: llvm_unreachable("Invalid code"); 10731 case ISD::ADDC: Opc = X86ISD::ADD; break; 10732 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 10733 case ISD::SUBC: Opc = X86ISD::SUB; break; 10734 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 10735 } 10736 10737 if (!ExtraOp) 10738 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 10739 Op.getOperand(1)); 10740 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 10741 Op.getOperand(1), Op.getOperand(2)); 10742} 10743 10744/// LowerOperation - Provide custom lowering hooks for some operations. 
10745/// 10746SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 10747 switch (Op.getOpcode()) { 10748 default: llvm_unreachable("Should not custom lower this!"); 10749 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 10750 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 10751 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG); 10752 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 10753 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 10754 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); 10755 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 10756 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 10757 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 10758 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 10759 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 10760 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 10761 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); 10762 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 10763 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 10764 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 10765 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 10766 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 10767 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 10768 case ISD::SHL_PARTS: 10769 case ISD::SRA_PARTS: 10770 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 10771 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 10772 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 10773 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 10774 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 10775 case ISD::FABS: return LowerFABS(Op, DAG); 10776 case ISD::FNEG: return LowerFNEG(Op, DAG); 10777 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 10778 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 10779 case ISD::SETCC: return LowerSETCC(Op, DAG); 10780 case ISD::SELECT: return LowerSELECT(Op, DAG); 10781 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 10782 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 10783 case ISD::VASTART: return LowerVASTART(Op, DAG); 10784 case ISD::VAARG: return LowerVAARG(Op, DAG); 10785 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 10786 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 10787 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 10788 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 10789 case ISD::FRAME_TO_ARGS_OFFSET: 10790 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 10791 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 10792 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 10793 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 10794 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 10795 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 10796 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 10797 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); 10798 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 10799 case ISD::MUL: return LowerMUL(Op, DAG); 10800 case ISD::SRA: 10801 case ISD::SRL: 10802 case ISD::SHL: return LowerShift(Op, DAG); 10803 case ISD::SADDO: 10804 case ISD::UADDO: 10805 case ISD::SSUBO: 10806 case ISD::USUBO: 10807 case ISD::SMULO: 10808 case ISD::UMULO: return LowerXALUO(Op, DAG); 10809 
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 10810 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 10811 case ISD::ADDC: 10812 case ISD::ADDE: 10813 case ISD::SUBC: 10814 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 10815 case ISD::ADD: return LowerADD(Op, DAG); 10816 case ISD::SUB: return LowerSUB(Op, DAG); 10817 } 10818} 10819 10820static void ReplaceATOMIC_LOAD(SDNode *Node, 10821 SmallVectorImpl<SDValue> &Results, 10822 SelectionDAG &DAG) { 10823 DebugLoc dl = Node->getDebugLoc(); 10824 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 10825 10826 // Convert wide load -> cmpxchg8b/cmpxchg16b 10827 // FIXME: On 32-bit, load -> fild or movq would be more efficient 10828 // (The only way to get a 16-byte load is cmpxchg16b) 10829 // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. 10830 SDValue Zero = DAG.getConstant(0, VT); 10831 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, 10832 Node->getOperand(0), 10833 Node->getOperand(1), Zero, Zero, 10834 cast<AtomicSDNode>(Node)->getMemOperand(), 10835 cast<AtomicSDNode>(Node)->getOrdering(), 10836 cast<AtomicSDNode>(Node)->getSynchScope()); 10837 Results.push_back(Swap.getValue(0)); 10838 Results.push_back(Swap.getValue(1)); 10839} 10840 10841void X86TargetLowering:: 10842ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 10843 SelectionDAG &DAG, unsigned NewOp) const { 10844 DebugLoc dl = Node->getDebugLoc(); 10845 assert (Node->getValueType(0) == MVT::i64 && 10846 "Only know how to expand i64 atomics"); 10847 10848 SDValue Chain = Node->getOperand(0); 10849 SDValue In1 = Node->getOperand(1); 10850 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10851 Node->getOperand(2), DAG.getIntPtrConstant(0)); 10852 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10853 Node->getOperand(2), DAG.getIntPtrConstant(1)); 10854 SDValue Ops[] = { Chain, In1, In2L, In2H }; 10855 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 10856 SDValue Result = 10857 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 10858 cast<MemSDNode>(Node)->getMemOperand()); 10859 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 10860 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 10861 Results.push_back(Result.getValue(2)); 10862} 10863 10864/// ReplaceNodeResults - Replace a node with an illegal result type 10865/// with a new node built out of custom code. 10866void X86TargetLowering::ReplaceNodeResults(SDNode *N, 10867 SmallVectorImpl<SDValue>&Results, 10868 SelectionDAG &DAG) const { 10869 DebugLoc dl = N->getDebugLoc(); 10870 switch (N->getOpcode()) { 10871 default: 10872 llvm_unreachable("Do not know how to custom type legalize this operation!"); 10873 case ISD::SIGN_EXTEND_INREG: 10874 case ISD::ADDC: 10875 case ISD::ADDE: 10876 case ISD::SUBC: 10877 case ISD::SUBE: 10878 // We don't want to expand or promote these. 10879 return; 10880 case ISD::FP_TO_SINT: 10881 case ISD::FP_TO_UINT: { 10882 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 10883 10884 if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) 10885 return; 10886 10887 std::pair<SDValue,SDValue> Vals = 10888 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); 10889 SDValue FIST = Vals.first, StackSlot = Vals.second; 10890 if (FIST.getNode() != 0) { 10891 EVT VT = N->getValueType(0); 10892 // Return a load from the stack slot. 
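      // If no stack slot was created, FIST itself already carries the
      // converted value, so push it back directly.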
10893 if (StackSlot.getNode() != 0) 10894 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 10895 MachinePointerInfo(), 10896 false, false, false, 0)); 10897 else 10898 Results.push_back(FIST); 10899 } 10900 return; 10901 } 10902 case ISD::READCYCLECOUNTER: { 10903 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10904 SDValue TheChain = N->getOperand(0); 10905 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 10906 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 10907 rd.getValue(1)); 10908 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 10909 eax.getValue(2)); 10910 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 10911 SDValue Ops[] = { eax, edx }; 10912 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 10913 Results.push_back(edx.getValue(1)); 10914 return; 10915 } 10916 case ISD::ATOMIC_CMP_SWAP: { 10917 EVT T = N->getValueType(0); 10918 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); 10919 bool Regs64bit = T == MVT::i128; 10920 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; 10921 SDValue cpInL, cpInH; 10922 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 10923 DAG.getConstant(0, HalfT)); 10924 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 10925 DAG.getConstant(1, HalfT)); 10926 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, 10927 Regs64bit ? X86::RAX : X86::EAX, 10928 cpInL, SDValue()); 10929 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, 10930 Regs64bit ? X86::RDX : X86::EDX, 10931 cpInH, cpInL.getValue(1)); 10932 SDValue swapInL, swapInH; 10933 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 10934 DAG.getConstant(0, HalfT)); 10935 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 10936 DAG.getConstant(1, HalfT)); 10937 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, 10938 Regs64bit ? X86::RBX : X86::EBX, 10939 swapInL, cpInH.getValue(1)); 10940 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, 10941 Regs64bit ? X86::RCX : X86::ECX, 10942 swapInH, swapInL.getValue(1)); 10943 SDValue Ops[] = { swapInH.getValue(0), 10944 N->getOperand(1), 10945 swapInH.getValue(1) }; 10946 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10947 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 10948 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : 10949 X86ISD::LCMPXCHG8_DAG; 10950 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, 10951 Ops, 3, T, MMO); 10952 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, 10953 Regs64bit ? X86::RAX : X86::EAX, 10954 HalfT, Result.getValue(1)); 10955 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, 10956 Regs64bit ? 
X86::RDX : X86::EDX, 10957 HalfT, cpOutL.getValue(2)); 10958 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 10959 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); 10960 Results.push_back(cpOutH.getValue(1)); 10961 return; 10962 } 10963 case ISD::ATOMIC_LOAD_ADD: 10964 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 10965 return; 10966 case ISD::ATOMIC_LOAD_AND: 10967 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 10968 return; 10969 case ISD::ATOMIC_LOAD_NAND: 10970 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 10971 return; 10972 case ISD::ATOMIC_LOAD_OR: 10973 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 10974 return; 10975 case ISD::ATOMIC_LOAD_SUB: 10976 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 10977 return; 10978 case ISD::ATOMIC_LOAD_XOR: 10979 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 10980 return; 10981 case ISD::ATOMIC_SWAP: 10982 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 10983 return; 10984 case ISD::ATOMIC_LOAD: 10985 ReplaceATOMIC_LOAD(N, Results, DAG); 10986 } 10987} 10988 10989const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 10990 switch (Opcode) { 10991 default: return NULL; 10992 case X86ISD::BSF: return "X86ISD::BSF"; 10993 case X86ISD::BSR: return "X86ISD::BSR"; 10994 case X86ISD::SHLD: return "X86ISD::SHLD"; 10995 case X86ISD::SHRD: return "X86ISD::SHRD"; 10996 case X86ISD::FAND: return "X86ISD::FAND"; 10997 case X86ISD::FOR: return "X86ISD::FOR"; 10998 case X86ISD::FXOR: return "X86ISD::FXOR"; 10999 case X86ISD::FSRL: return "X86ISD::FSRL"; 11000 case X86ISD::FILD: return "X86ISD::FILD"; 11001 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 11002 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 11003 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 11004 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 11005 case X86ISD::FLD: return "X86ISD::FLD"; 11006 case X86ISD::FST: return "X86ISD::FST"; 11007 case X86ISD::CALL: return "X86ISD::CALL"; 11008 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 11009 case X86ISD::BT: return "X86ISD::BT"; 11010 case X86ISD::CMP: return "X86ISD::CMP"; 11011 case X86ISD::COMI: return "X86ISD::COMI"; 11012 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 11013 case X86ISD::SETCC: return "X86ISD::SETCC"; 11014 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 11015 case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; 11016 case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; 11017 case X86ISD::CMOV: return "X86ISD::CMOV"; 11018 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 11019 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 11020 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 11021 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 11022 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 11023 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 11024 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 11025 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 11026 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 11027 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 11028 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 11029 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 11030 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 11031 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 11032 case X86ISD::PSIGN: return "X86ISD::PSIGN"; 11033 case X86ISD::BLENDV: return "X86ISD::BLENDV"; 11034 case 
X86ISD::HADD: return "X86ISD::HADD"; 11035 case X86ISD::HSUB: return "X86ISD::HSUB"; 11036 case X86ISD::FHADD: return "X86ISD::FHADD"; 11037 case X86ISD::FHSUB: return "X86ISD::FHSUB"; 11038 case X86ISD::FMAX: return "X86ISD::FMAX"; 11039 case X86ISD::FMIN: return "X86ISD::FMIN"; 11040 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 11041 case X86ISD::FRCP: return "X86ISD::FRCP"; 11042 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 11043 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 11044 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 11045 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 11046 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 11047 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 11048 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 11049 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 11050 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 11051 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 11052 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 11053 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 11054 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 11055 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 11056 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 11057 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; 11058 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; 11059 case X86ISD::VSHL: return "X86ISD::VSHL"; 11060 case X86ISD::VSRL: return "X86ISD::VSRL"; 11061 case X86ISD::VSRA: return "X86ISD::VSRA"; 11062 case X86ISD::VSHLI: return "X86ISD::VSHLI"; 11063 case X86ISD::VSRLI: return "X86ISD::VSRLI"; 11064 case X86ISD::VSRAI: return "X86ISD::VSRAI"; 11065 case X86ISD::CMPP: return "X86ISD::CMPP"; 11066 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; 11067 case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; 11068 case X86ISD::ADD: return "X86ISD::ADD"; 11069 case X86ISD::SUB: return "X86ISD::SUB"; 11070 case X86ISD::ADC: return "X86ISD::ADC"; 11071 case X86ISD::SBB: return "X86ISD::SBB"; 11072 case X86ISD::SMUL: return "X86ISD::SMUL"; 11073 case X86ISD::UMUL: return "X86ISD::UMUL"; 11074 case X86ISD::INC: return "X86ISD::INC"; 11075 case X86ISD::DEC: return "X86ISD::DEC"; 11076 case X86ISD::OR: return "X86ISD::OR"; 11077 case X86ISD::XOR: return "X86ISD::XOR"; 11078 case X86ISD::AND: return "X86ISD::AND"; 11079 case X86ISD::ANDN: return "X86ISD::ANDN"; 11080 case X86ISD::BLSI: return "X86ISD::BLSI"; 11081 case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; 11082 case X86ISD::BLSR: return "X86ISD::BLSR"; 11083 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 11084 case X86ISD::PTEST: return "X86ISD::PTEST"; 11085 case X86ISD::TESTP: return "X86ISD::TESTP"; 11086 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 11087 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 11088 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 11089 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 11090 case X86ISD::SHUFP: return "X86ISD::SHUFP"; 11091 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 11092 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 11093 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 11094 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 11095 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 11096 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 11097 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 11098 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 11099 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 11100 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 11101 case X86ISD::UNPCKL: return 
"X86ISD::UNPCKL"; 11102 case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; 11103 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 11104 case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; 11105 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; 11106 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; 11107 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 11108 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 11109 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 11110 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 11111 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; 11112 case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; 11113 } 11114} 11115 11116// isLegalAddressingMode - Return true if the addressing mode represented 11117// by AM is legal for this target, for a load/store of the specified type. 11118bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 11119 Type *Ty) const { 11120 // X86 supports extremely general addressing modes. 11121 CodeModel::Model M = getTargetMachine().getCodeModel(); 11122 Reloc::Model R = getTargetMachine().getRelocationModel(); 11123 11124 // X86 allows a sign-extended 32-bit immediate field as a displacement. 11125 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 11126 return false; 11127 11128 if (AM.BaseGV) { 11129 unsigned GVFlags = 11130 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 11131 11132 // If a reference to this global requires an extra load, we can't fold it. 11133 if (isGlobalStubReference(GVFlags)) 11134 return false; 11135 11136 // If BaseGV requires a register for the PIC base, we cannot also have a 11137 // BaseReg specified. 11138 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 11139 return false; 11140 11141 // If lower 4G is not available, then we must use rip-relative addressing. 11142 if ((M != CodeModel::Small || R != Reloc::Static) && 11143 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 11144 return false; 11145 } 11146 11147 switch (AM.Scale) { 11148 case 0: 11149 case 1: 11150 case 2: 11151 case 4: 11152 case 8: 11153 // These scales always work. 11154 break; 11155 case 3: 11156 case 5: 11157 case 9: 11158 // These scales are formed with basereg+scalereg. Only accept if there is 11159 // no basereg yet. 11160 if (AM.HasBaseReg) 11161 return false; 11162 break; 11163 default: // Other stuff never works. 11164 return false; 11165 } 11166 11167 return true; 11168} 11169 11170 11171bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 11172 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 11173 return false; 11174 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 11175 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 11176 if (NumBits1 <= NumBits2) 11177 return false; 11178 return true; 11179} 11180 11181bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 11182 if (!VT1.isInteger() || !VT2.isInteger()) 11183 return false; 11184 unsigned NumBits1 = VT1.getSizeInBits(); 11185 unsigned NumBits2 = VT2.getSizeInBits(); 11186 if (NumBits1 <= NumBits2) 11187 return false; 11188 return true; 11189} 11190 11191bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 11192 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 11193 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 11194} 11195 11196bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 11197 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 
11198 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 11199} 11200 11201bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 11202 // i16 instructions are longer (0x66 prefix) and potentially slower. 11203 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 11204} 11205 11206/// isShuffleMaskLegal - Targets can use this to indicate that they only 11207/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 11208/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 11209/// are assumed to be legal. 11210bool 11211X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 11212 EVT VT) const { 11213 // Very little shuffling can be done for 64-bit vectors right now. 11214 if (VT.getSizeInBits() == 64) 11215 return false; 11216 11217 // FIXME: pshufb, blends, shifts. 11218 return (VT.getVectorNumElements() == 2 || 11219 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 11220 isMOVLMask(M, VT) || 11221 isSHUFPMask(M, VT, Subtarget->hasAVX()) || 11222 isPSHUFDMask(M, VT) || 11223 isPSHUFHWMask(M, VT) || 11224 isPSHUFLWMask(M, VT) || 11225 isPALIGNRMask(M, VT, Subtarget) || 11226 isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || 11227 isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || 11228 isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) || 11229 isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2())); 11230} 11231 11232bool 11233X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 11234 EVT VT) const { 11235 unsigned NumElts = VT.getVectorNumElements(); 11236 // FIXME: This collection of masks seems suspect. 11237 if (NumElts == 2) 11238 return true; 11239 if (NumElts == 4 && VT.getSizeInBits() == 128) { 11240 return (isMOVLMask(Mask, VT) || 11241 isCommutedMOVLMask(Mask, VT, true) || 11242 isSHUFPMask(Mask, VT, Subtarget->hasAVX()) || 11243 isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true)); 11244 } 11245 return false; 11246} 11247 11248//===----------------------------------------------------------------------===// 11249// X86 Scheduler Hooks 11250//===----------------------------------------------------------------------===// 11251 11252// private utility function 11253MachineBasicBlock * 11254X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 11255 MachineBasicBlock *MBB, 11256 unsigned regOpc, 11257 unsigned immOpc, 11258 unsigned LoadOpc, 11259 unsigned CXchgOpc, 11260 unsigned notOpc, 11261 unsigned EAXreg, 11262 const TargetRegisterClass *RC, 11263 bool invSrc) const { 11264 // For the atomic bitwise operator, we generate 11265 // thisMBB: 11266 // newMBB: 11267 // ld t1 = [bitinstr.addr] 11268 // op t2 = t1, [bitinstr.val] 11269 // mov EAX = t1 11270 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 11271 // bz newMBB 11272 // fallthrough -->nextMBB 11273 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11274 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11275 MachineFunction::iterator MBBIter = MBB; 11276 ++MBBIter; 11277 11278 /// First build the CFG 11279 MachineFunction *F = MBB->getParent(); 11280 MachineBasicBlock *thisMBB = MBB; 11281 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 11282 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 11283 F->insert(MBBIter, newMBB); 11284 F->insert(MBBIter, nextMBB); 11285 11286 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 
11287 nextMBB->splice(nextMBB->begin(), thisMBB, 11288 llvm::next(MachineBasicBlock::iterator(bInstr)), 11289 thisMBB->end()); 11290 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11291 11292 // Update thisMBB to fall through to newMBB 11293 thisMBB->addSuccessor(newMBB); 11294 11295 // newMBB jumps to itself and fall through to nextMBB 11296 newMBB->addSuccessor(nextMBB); 11297 newMBB->addSuccessor(newMBB); 11298 11299 // Insert instructions into newMBB based on incoming instruction 11300 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 11301 "unexpected number of operands"); 11302 DebugLoc dl = bInstr->getDebugLoc(); 11303 MachineOperand& destOper = bInstr->getOperand(0); 11304 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 11305 int numArgs = bInstr->getNumOperands() - 1; 11306 for (int i=0; i < numArgs; ++i) 11307 argOpers[i] = &bInstr->getOperand(i+1); 11308 11309 // x86 address has 4 operands: base, index, scale, and displacement 11310 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 11311 int valArgIndx = lastAddrIndx + 1; 11312 11313 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 11314 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 11315 for (int i=0; i <= lastAddrIndx; ++i) 11316 (*MIB).addOperand(*argOpers[i]); 11317 11318 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 11319 if (invSrc) { 11320 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 11321 } 11322 else 11323 tt = t1; 11324 11325 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 11326 assert((argOpers[valArgIndx]->isReg() || 11327 argOpers[valArgIndx]->isImm()) && 11328 "invalid operand"); 11329 if (argOpers[valArgIndx]->isReg()) 11330 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 11331 else 11332 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 11333 MIB.addReg(tt); 11334 (*MIB).addOperand(*argOpers[valArgIndx]); 11335 11336 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 11337 MIB.addReg(t1); 11338 11339 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 11340 for (int i=0; i <= lastAddrIndx; ++i) 11341 (*MIB).addOperand(*argOpers[i]); 11342 MIB.addReg(t2); 11343 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 11344 (*MIB).setMemRefs(bInstr->memoperands_begin(), 11345 bInstr->memoperands_end()); 11346 11347 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 11348 MIB.addReg(EAXreg); 11349 11350 // insert branch 11351 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 11352 11353 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 11354 return nextMBB; 11355} 11356 11357// private utility function: 64 bit atomics on 32 bit host. 
11358MachineBasicBlock * 11359X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 11360 MachineBasicBlock *MBB, 11361 unsigned regOpcL, 11362 unsigned regOpcH, 11363 unsigned immOpcL, 11364 unsigned immOpcH, 11365 bool invSrc) const { 11366 // For the atomic bitwise operator, we generate 11367 // thisMBB (instructions are in pairs, except cmpxchg8b) 11368 // ld t1,t2 = [bitinstr.addr] 11369 // newMBB: 11370 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 11371 // op t5, t6 <- out1, out2, [bitinstr.val] 11372 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 11373 // mov ECX, EBX <- t5, t6 11374 // mov EAX, EDX <- t1, t2 11375 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 11376 // mov t3, t4 <- EAX, EDX 11377 // bz newMBB 11378 // result in out1, out2 11379 // fallthrough -->nextMBB 11380 11381 const TargetRegisterClass *RC = X86::GR32RegisterClass; 11382 const unsigned LoadOpc = X86::MOV32rm; 11383 const unsigned NotOpc = X86::NOT32r; 11384 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11385 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11386 MachineFunction::iterator MBBIter = MBB; 11387 ++MBBIter; 11388 11389 /// First build the CFG 11390 MachineFunction *F = MBB->getParent(); 11391 MachineBasicBlock *thisMBB = MBB; 11392 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 11393 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 11394 F->insert(MBBIter, newMBB); 11395 F->insert(MBBIter, nextMBB); 11396 11397 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 11398 nextMBB->splice(nextMBB->begin(), thisMBB, 11399 llvm::next(MachineBasicBlock::iterator(bInstr)), 11400 thisMBB->end()); 11401 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11402 11403 // Update thisMBB to fall through to newMBB 11404 thisMBB->addSuccessor(newMBB); 11405 11406 // newMBB jumps to itself and fall through to nextMBB 11407 newMBB->addSuccessor(nextMBB); 11408 newMBB->addSuccessor(newMBB); 11409 11410 DebugLoc dl = bInstr->getDebugLoc(); 11411 // Insert instructions into newMBB based on incoming instruction 11412 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 11413 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 11414 "unexpected number of operands"); 11415 MachineOperand& dest1Oper = bInstr->getOperand(0); 11416 MachineOperand& dest2Oper = bInstr->getOperand(1); 11417 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 11418 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 11419 argOpers[i] = &bInstr->getOperand(i+2); 11420 11421 // We use some of the operands multiple times, so conservatively just 11422 // clear any kill flags that might be present. 11423 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 11424 argOpers[i]->setIsKill(false); 11425 } 11426 11427 // x86 address has 5 operands: base, index, scale, displacement, and segment. 11428 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 11429 11430 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 11431 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 11432 for (int i=0; i <= lastAddrIndx; ++i) 11433 (*MIB).addOperand(*argOpers[i]); 11434 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 11435 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 11436 // add 4 to displacement. 
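  // The high half of the i64 lives 4 bytes above the low half, so rebuild
  // the same address operands with the displacement (or offset) bumped by 4.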
11437 for (int i=0; i <= lastAddrIndx-2; ++i) 11438 (*MIB).addOperand(*argOpers[i]); 11439 MachineOperand newOp3 = *(argOpers[3]); 11440 if (newOp3.isImm()) 11441 newOp3.setImm(newOp3.getImm()+4); 11442 else 11443 newOp3.setOffset(newOp3.getOffset()+4); 11444 (*MIB).addOperand(newOp3); 11445 (*MIB).addOperand(*argOpers[lastAddrIndx]); 11446 11447 // t3/4 are defined later, at the bottom of the loop 11448 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 11449 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 11450 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 11451 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 11452 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 11453 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 11454 11455 // The subsequent operations should be using the destination registers of 11456 //the PHI instructions. 11457 if (invSrc) { 11458 t1 = F->getRegInfo().createVirtualRegister(RC); 11459 t2 = F->getRegInfo().createVirtualRegister(RC); 11460 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 11461 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 11462 } else { 11463 t1 = dest1Oper.getReg(); 11464 t2 = dest2Oper.getReg(); 11465 } 11466 11467 int valArgIndx = lastAddrIndx + 1; 11468 assert((argOpers[valArgIndx]->isReg() || 11469 argOpers[valArgIndx]->isImm()) && 11470 "invalid operand"); 11471 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 11472 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 11473 if (argOpers[valArgIndx]->isReg()) 11474 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 11475 else 11476 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 11477 if (regOpcL != X86::MOV32rr) 11478 MIB.addReg(t1); 11479 (*MIB).addOperand(*argOpers[valArgIndx]); 11480 assert(argOpers[valArgIndx + 1]->isReg() == 11481 argOpers[valArgIndx]->isReg()); 11482 assert(argOpers[valArgIndx + 1]->isImm() == 11483 argOpers[valArgIndx]->isImm()); 11484 if (argOpers[valArgIndx + 1]->isReg()) 11485 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 11486 else 11487 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 11488 if (regOpcH != X86::MOV32rr) 11489 MIB.addReg(t2); 11490 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 11491 11492 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 11493 MIB.addReg(t1); 11494 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 11495 MIB.addReg(t2); 11496 11497 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 11498 MIB.addReg(t5); 11499 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 11500 MIB.addReg(t6); 11501 11502 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 11503 for (int i=0; i <= lastAddrIndx; ++i) 11504 (*MIB).addOperand(*argOpers[i]); 11505 11506 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 11507 (*MIB).setMemRefs(bInstr->memoperands_begin(), 11508 bInstr->memoperands_end()); 11509 11510 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 11511 MIB.addReg(X86::EAX); 11512 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 11513 MIB.addReg(X86::EDX); 11514 11515 // insert branch 11516 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 11517 11518 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 
11519 return nextMBB; 11520} 11521 11522// private utility function 11523MachineBasicBlock * 11524X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 11525 MachineBasicBlock *MBB, 11526 unsigned cmovOpc) const { 11527 // For the atomic min/max operator, we generate 11528 // thisMBB: 11529 // newMBB: 11530 // ld t1 = [min/max.addr] 11531 // mov t2 = [min/max.val] 11532 // cmp t1, t2 11533 // cmov[cond] t2 = t1 11534 // mov EAX = t1 11535 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 11536 // bz newMBB 11537 // fallthrough -->nextMBB 11538 // 11539 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11540 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11541 MachineFunction::iterator MBBIter = MBB; 11542 ++MBBIter; 11543 11544 /// First build the CFG 11545 MachineFunction *F = MBB->getParent(); 11546 MachineBasicBlock *thisMBB = MBB; 11547 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 11548 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 11549 F->insert(MBBIter, newMBB); 11550 F->insert(MBBIter, nextMBB); 11551 11552 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 11553 nextMBB->splice(nextMBB->begin(), thisMBB, 11554 llvm::next(MachineBasicBlock::iterator(mInstr)), 11555 thisMBB->end()); 11556 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11557 11558 // Update thisMBB to fall through to newMBB 11559 thisMBB->addSuccessor(newMBB); 11560 11561 // newMBB jumps to newMBB and fall through to nextMBB 11562 newMBB->addSuccessor(nextMBB); 11563 newMBB->addSuccessor(newMBB); 11564 11565 DebugLoc dl = mInstr->getDebugLoc(); 11566 // Insert instructions into newMBB based on incoming instruction 11567 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 11568 "unexpected number of operands"); 11569 MachineOperand& destOper = mInstr->getOperand(0); 11570 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 11571 int numArgs = mInstr->getNumOperands() - 1; 11572 for (int i=0; i < numArgs; ++i) 11573 argOpers[i] = &mInstr->getOperand(i+1); 11574 11575 // x86 address has 4 operands: base, index, scale, and displacement 11576 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 11577 int valArgIndx = lastAddrIndx + 1; 11578 11579 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 11580 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 11581 for (int i=0; i <= lastAddrIndx; ++i) 11582 (*MIB).addOperand(*argOpers[i]); 11583 11584 // We only support register and immediate values 11585 assert((argOpers[valArgIndx]->isReg() || 11586 argOpers[valArgIndx]->isImm()) && 11587 "invalid operand"); 11588 11589 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 11590 if (argOpers[valArgIndx]->isReg()) 11591 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 11592 else 11593 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 11594 (*MIB).addOperand(*argOpers[valArgIndx]); 11595 11596 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 11597 MIB.addReg(t1); 11598 11599 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 11600 MIB.addReg(t1); 11601 MIB.addReg(t2); 11602 11603 // Generate movc 11604 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 11605 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 11606 MIB.addReg(t2); 11607 MIB.addReg(t1); 11608 11609 // Cmp and exchange if none has modified the memory location 11610 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 
11611 for (int i=0; i <= lastAddrIndx; ++i) 11612 (*MIB).addOperand(*argOpers[i]); 11613 MIB.addReg(t3); 11614 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 11615 (*MIB).setMemRefs(mInstr->memoperands_begin(), 11616 mInstr->memoperands_end()); 11617 11618 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 11619 MIB.addReg(X86::EAX); 11620 11621 // insert branch 11622 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 11623 11624 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 11625 return nextMBB; 11626} 11627 11628// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 11629// or XMM0_V32I8 in AVX all of this code can be replaced with that 11630// in the .td file. 11631MachineBasicBlock * 11632X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 11633 unsigned numArgs, bool memArg) const { 11634 assert(Subtarget->hasSSE42() && 11635 "Target must have SSE4.2 or AVX features enabled"); 11636 11637 DebugLoc dl = MI->getDebugLoc(); 11638 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11639 unsigned Opc; 11640 if (!Subtarget->hasAVX()) { 11641 if (memArg) 11642 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 11643 else 11644 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 11645 } else { 11646 if (memArg) 11647 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 11648 else 11649 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 11650 } 11651 11652 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 11653 for (unsigned i = 0; i < numArgs; ++i) { 11654 MachineOperand &Op = MI->getOperand(i+1); 11655 if (!(Op.isReg() && Op.isImplicit())) 11656 MIB.addOperand(Op); 11657 } 11658 BuildMI(*BB, MI, dl, 11659 TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr), 11660 MI->getOperand(0).getReg()) 11661 .addReg(X86::XMM0); 11662 11663 MI->eraseFromParent(); 11664 return BB; 11665} 11666 11667MachineBasicBlock * 11668X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { 11669 DebugLoc dl = MI->getDebugLoc(); 11670 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11671 11672 // Address into RAX/EAX, other two args into ECX, EDX. 11673 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 11674 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 11675 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 11676 for (int i = 0; i < X86::AddrNumOperands; ++i) 11677 MIB.addOperand(MI->getOperand(i)); 11678 11679 unsigned ValOps = X86::AddrNumOperands; 11680 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 11681 .addReg(MI->getOperand(ValOps).getReg()); 11682 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 11683 .addReg(MI->getOperand(ValOps+1).getReg()); 11684 11685 // The instruction doesn't actually take any operands though. 11686 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 11687 11688 MI->eraseFromParent(); // The pseudo is gone now. 11689 return BB; 11690} 11691 11692MachineBasicBlock * 11693X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { 11694 DebugLoc dl = MI->getDebugLoc(); 11695 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11696 11697 // First arg in ECX, the second in EAX. 
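  // (The MWAIT instruction reads its extensions from ECX and its hints from
  // EAX, which is why the pseudo's arguments are copied into those registers.)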
11698 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 11699 .addReg(MI->getOperand(0).getReg()); 11700 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) 11701 .addReg(MI->getOperand(1).getReg()); 11702 11703 // The instruction doesn't actually take any operands though. 11704 BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); 11705 11706 MI->eraseFromParent(); // The pseudo is gone now. 11707 return BB; 11708} 11709 11710MachineBasicBlock * 11711X86TargetLowering::EmitVAARG64WithCustomInserter( 11712 MachineInstr *MI, 11713 MachineBasicBlock *MBB) const { 11714 // Emit va_arg instruction on X86-64. 11715 11716 // Operands to this pseudo-instruction: 11717 // 0 ) Output : destination address (reg) 11718 // 1-5) Input : va_list address (addr, i64mem) 11719 // 6 ) ArgSize : Size (in bytes) of vararg type 11720 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 11721 // 8 ) Align : Alignment of type 11722 // 9 ) EFLAGS (implicit-def) 11723 11724 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 11725 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 11726 11727 unsigned DestReg = MI->getOperand(0).getReg(); 11728 MachineOperand &Base = MI->getOperand(1); 11729 MachineOperand &Scale = MI->getOperand(2); 11730 MachineOperand &Index = MI->getOperand(3); 11731 MachineOperand &Disp = MI->getOperand(4); 11732 MachineOperand &Segment = MI->getOperand(5); 11733 unsigned ArgSize = MI->getOperand(6).getImm(); 11734 unsigned ArgMode = MI->getOperand(7).getImm(); 11735 unsigned Align = MI->getOperand(8).getImm(); 11736 11737 // Memory Reference 11738 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 11739 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 11740 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 11741 11742 // Machine Information 11743 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11744 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 11745 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 11746 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 11747 DebugLoc DL = MI->getDebugLoc(); 11748 11749 // struct va_list { 11750 // i32 gp_offset 11751 // i32 fp_offset 11752 // i64 overflow_area (address) 11753 // i64 reg_save_area (address) 11754 // } 11755 // sizeof(va_list) = 24 11756 // alignment(va_list) = 8 11757 11758 unsigned TotalNumIntRegs = 6; 11759 unsigned TotalNumXMMRegs = 8; 11760 bool UseGPOffset = (ArgMode == 1); 11761 bool UseFPOffset = (ArgMode == 2); 11762 unsigned MaxOffset = TotalNumIntRegs * 8 + 11763 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 11764 11765 /* Align ArgSize to a multiple of 8 */ 11766 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 11767 bool NeedsAlign = (Align > 8); 11768 11769 MachineBasicBlock *thisMBB = MBB; 11770 MachineBasicBlock *overflowMBB; 11771 MachineBasicBlock *offsetMBB; 11772 MachineBasicBlock *endMBB; 11773 11774 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 11775 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 11776 unsigned OffsetReg = 0; 11777 11778 if (!UseGPOffset && !UseFPOffset) { 11779 // If we only pull from the overflow region, we don't create a branch. 11780 // We don't need to alter control flow. 
11781 OffsetDestReg = 0; // unused 11782 OverflowDestReg = DestReg; 11783 11784 offsetMBB = NULL; 11785 overflowMBB = thisMBB; 11786 endMBB = thisMBB; 11787 } else { 11788 // First emit code to check if gp_offset (or fp_offset) is below the bound. 11789 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 11790 // If not, pull from overflow_area. (branch to overflowMBB) 11791 // 11792 // thisMBB 11793 // | . 11794 // | . 11795 // offsetMBB overflowMBB 11796 // | . 11797 // | . 11798 // endMBB 11799 11800 // Registers for the PHI in endMBB 11801 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 11802 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 11803 11804 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11805 MachineFunction *MF = MBB->getParent(); 11806 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11807 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11808 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11809 11810 MachineFunction::iterator MBBIter = MBB; 11811 ++MBBIter; 11812 11813 // Insert the new basic blocks 11814 MF->insert(MBBIter, offsetMBB); 11815 MF->insert(MBBIter, overflowMBB); 11816 MF->insert(MBBIter, endMBB); 11817 11818 // Transfer the remainder of MBB and its successor edges to endMBB. 11819 endMBB->splice(endMBB->begin(), thisMBB, 11820 llvm::next(MachineBasicBlock::iterator(MI)), 11821 thisMBB->end()); 11822 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11823 11824 // Make offsetMBB and overflowMBB successors of thisMBB 11825 thisMBB->addSuccessor(offsetMBB); 11826 thisMBB->addSuccessor(overflowMBB); 11827 11828 // endMBB is a successor of both offsetMBB and overflowMBB 11829 offsetMBB->addSuccessor(endMBB); 11830 overflowMBB->addSuccessor(endMBB); 11831 11832 // Load the offset value into a register 11833 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 11834 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 11835 .addOperand(Base) 11836 .addOperand(Scale) 11837 .addOperand(Index) 11838 .addDisp(Disp, UseFPOffset ? 4 : 0) 11839 .addOperand(Segment) 11840 .setMemRefs(MMOBegin, MMOEnd); 11841 11842 // Check if there is enough room left to pull this argument. 11843 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 11844 .addReg(OffsetReg) 11845 .addImm(MaxOffset + 8 - ArgSizeA8); 11846 11847 // Branch to "overflowMBB" if offset >= max 11848 // Fall through to "offsetMBB" otherwise 11849 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 11850 .addMBB(overflowMBB); 11851 } 11852 11853 // In offsetMBB, emit code to use the reg_save_area. 11854 if (offsetMBB) { 11855 assert(OffsetReg != 0); 11856 11857 // Read the reg_save_area address. 11858 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 11859 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 11860 .addOperand(Base) 11861 .addOperand(Scale) 11862 .addOperand(Index) 11863 .addDisp(Disp, 16) 11864 .addOperand(Segment) 11865 .setMemRefs(MMOBegin, MMOEnd); 11866 11867 // Zero-extend the offset 11868 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 11869 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 11870 .addImm(0) 11871 .addReg(OffsetReg) 11872 .addImm(X86::sub_32bit); 11873 11874 // Add the offset to the reg_save_area to get the final address. 
11875 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 11876 .addReg(OffsetReg64) 11877 .addReg(RegSaveReg); 11878 11879 // Compute the offset for the next argument 11880 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 11881 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 11882 .addReg(OffsetReg) 11883 .addImm(UseFPOffset ? 16 : 8); 11884 11885 // Store it back into the va_list. 11886 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 11887 .addOperand(Base) 11888 .addOperand(Scale) 11889 .addOperand(Index) 11890 .addDisp(Disp, UseFPOffset ? 4 : 0) 11891 .addOperand(Segment) 11892 .addReg(NextOffsetReg) 11893 .setMemRefs(MMOBegin, MMOEnd); 11894 11895 // Jump to endMBB 11896 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 11897 .addMBB(endMBB); 11898 } 11899 11900 // 11901 // Emit code to use overflow area 11902 // 11903 11904 // Load the overflow_area address into a register. 11905 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 11906 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 11907 .addOperand(Base) 11908 .addOperand(Scale) 11909 .addOperand(Index) 11910 .addDisp(Disp, 8) 11911 .addOperand(Segment) 11912 .setMemRefs(MMOBegin, MMOEnd); 11913 11914 // If we need to align it, do so. Otherwise, just copy the address 11915 // to OverflowDestReg. 11916 if (NeedsAlign) { 11917 // Align the overflow address 11918 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 11919 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 11920 11921 // aligned_addr = (addr + (align-1)) & ~(align-1) 11922 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 11923 .addReg(OverflowAddrReg) 11924 .addImm(Align-1); 11925 11926 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 11927 .addReg(TmpReg) 11928 .addImm(~(uint64_t)(Align-1)); 11929 } else { 11930 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 11931 .addReg(OverflowAddrReg); 11932 } 11933 11934 // Compute the next overflow address after this argument. 11935 // (the overflow address should be kept 8-byte aligned) 11936 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 11937 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 11938 .addReg(OverflowDestReg) 11939 .addImm(ArgSizeA8); 11940 11941 // Store the new overflow address. 11942 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 11943 .addOperand(Base) 11944 .addOperand(Scale) 11945 .addOperand(Index) 11946 .addDisp(Disp, 8) 11947 .addOperand(Segment) 11948 .addReg(NextAddrReg) 11949 .setMemRefs(MMOBegin, MMOEnd); 11950 11951 // If we branched, emit the PHI to the front of endMBB. 11952 if (offsetMBB) { 11953 BuildMI(*endMBB, endMBB->begin(), DL, 11954 TII->get(X86::PHI), DestReg) 11955 .addReg(OffsetDestReg).addMBB(offsetMBB) 11956 .addReg(OverflowDestReg).addMBB(overflowMBB); 11957 } 11958 11959 // Erase the pseudo instruction 11960 MI->eraseFromParent(); 11961 11962 return endMBB; 11963} 11964 11965MachineBasicBlock * 11966X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 11967 MachineInstr *MI, 11968 MachineBasicBlock *MBB) const { 11969 // Emit code to save XMM registers to the stack. The ABI says that the 11970 // number of registers to save is given in %al, so it's theoretically 11971 // possible to do an indirect jump trick to avoid saving all of them, 11972 // however this code takes a simpler approach and just executes all 11973 // of the stores if %al is non-zero. 
It's less code, and it's probably 11974 // easier on the hardware branch predictor, and stores aren't all that 11975 // expensive anyway. 11976 11977 // Create the new basic blocks. One block contains all the XMM stores, 11978 // and one block is the final destination regardless of whether any 11979 // stores were performed. 11980 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11981 MachineFunction *F = MBB->getParent(); 11982 MachineFunction::iterator MBBIter = MBB; 11983 ++MBBIter; 11984 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 11985 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 11986 F->insert(MBBIter, XMMSaveMBB); 11987 F->insert(MBBIter, EndMBB); 11988 11989 // Transfer the remainder of MBB and its successor edges to EndMBB. 11990 EndMBB->splice(EndMBB->begin(), MBB, 11991 llvm::next(MachineBasicBlock::iterator(MI)), 11992 MBB->end()); 11993 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 11994 11995 // The original block will now fall through to the XMM save block. 11996 MBB->addSuccessor(XMMSaveMBB); 11997 // The XMMSaveMBB will fall through to the end block. 11998 XMMSaveMBB->addSuccessor(EndMBB); 11999 12000 // Now add the instructions. 12001 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12002 DebugLoc DL = MI->getDebugLoc(); 12003 12004 unsigned CountReg = MI->getOperand(0).getReg(); 12005 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 12006 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 12007 12008 if (!Subtarget->isTargetWin64()) { 12009 // If %al is 0, branch around the XMM save block. 12010 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 12011 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 12012 MBB->addSuccessor(EndMBB); 12013 } 12014 12015 unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; 12016 // In the XMM save block, save all the XMM argument registers. 12017 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 12018 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 12019 MachineMemOperand *MMO = 12020 F->getMachineMemOperand( 12021 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 12022 MachineMemOperand::MOStore, 12023 /*Size=*/16, /*Align=*/16); 12024 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) 12025 .addFrameIndex(RegSaveFrameIndex) 12026 .addImm(/*Scale=*/1) 12027 .addReg(/*IndexReg=*/0) 12028 .addImm(/*Disp=*/Offset) 12029 .addReg(/*Segment=*/0) 12030 .addReg(MI->getOperand(i).getReg()) 12031 .addMemOperand(MMO); 12032 } 12033 12034 MI->eraseFromParent(); // The pseudo instruction is gone now. 12035 12036 return EndMBB; 12037} 12038 12039// The EFLAGS operand of SelectItr might be missing a kill marker 12040// because there were multiple uses of EFLAGS, and ISel didn't know 12041// which to mark. Figure out whether SelectItr should have had a 12042// kill marker, and set it if it should. Returns the correct kill 12043// marker value. 12044static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, 12045 MachineBasicBlock* BB, 12046 const TargetRegisterInfo* TRI) { 12047 // Scan forward through BB for a use/def of EFLAGS. 12048 MachineBasicBlock::iterator miI(llvm::next(SelectItr)); 12049 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 12050 const MachineInstr& mi = *miI; 12051 if (mi.readsRegister(X86::EFLAGS)) 12052 return false; 12053 if (mi.definesRegister(X86::EFLAGS)) 12054 break; // Should have kill-flag - update below. 
12055 } 12056 12057 // If we hit the end of the block, check whether EFLAGS is live into a 12058 // successor. 12059 if (miI == BB->end()) { 12060 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 12061 sEnd = BB->succ_end(); 12062 sItr != sEnd; ++sItr) { 12063 MachineBasicBlock* succ = *sItr; 12064 if (succ->isLiveIn(X86::EFLAGS)) 12065 return false; 12066 } 12067 } 12068 12069 // We found a def, or hit the end of the basic block and EFLAGS wasn't live 12070 // out. SelectMI should have a kill flag on EFLAGS. 12071 SelectItr->addRegisterKilled(X86::EFLAGS, TRI); 12072 return true; 12073} 12074 12075MachineBasicBlock * 12076X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 12077 MachineBasicBlock *BB) const { 12078 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12079 DebugLoc DL = MI->getDebugLoc(); 12080 12081 // To "insert" a SELECT_CC instruction, we actually have to insert the 12082 // diamond control-flow pattern. The incoming instruction knows the 12083 // destination vreg to set, the condition code register to branch on, the 12084 // true/false values to select between, and a branch opcode to use. 12085 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 12086 MachineFunction::iterator It = BB; 12087 ++It; 12088 12089 // thisMBB: 12090 // ... 12091 // TrueVal = ... 12092 // cmpTY ccX, r1, r2 12093 // bCC copy1MBB 12094 // fallthrough --> copy0MBB 12095 MachineBasicBlock *thisMBB = BB; 12096 MachineFunction *F = BB->getParent(); 12097 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 12098 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 12099 F->insert(It, copy0MBB); 12100 F->insert(It, sinkMBB); 12101 12102 // If the EFLAGS register isn't dead in the terminator, then claim that it's 12103 // live into the sink and copy blocks. 12104 const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); 12105 if (!MI->killsRegister(X86::EFLAGS) && 12106 !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { 12107 copy0MBB->addLiveIn(X86::EFLAGS); 12108 sinkMBB->addLiveIn(X86::EFLAGS); 12109 } 12110 12111 // Transfer the remainder of BB and its successor edges to sinkMBB. 12112 sinkMBB->splice(sinkMBB->begin(), BB, 12113 llvm::next(MachineBasicBlock::iterator(MI)), 12114 BB->end()); 12115 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 12116 12117 // Add the true and fallthrough blocks as its successors. 12118 BB->addSuccessor(copy0MBB); 12119 BB->addSuccessor(sinkMBB); 12120 12121 // Create the conditional branch instruction. 12122 unsigned Opc = 12123 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 12124 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 12125 12126 // copy0MBB: 12127 // %FalseValue = ... 12128 // # fallthrough to sinkMBB 12129 copy0MBB->addSuccessor(sinkMBB); 12130 12131 // sinkMBB: 12132 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 12133 // ... 12134 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 12135 TII->get(X86::PHI), MI->getOperand(0).getReg()) 12136 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 12137 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 12138 12139 MI->eraseFromParent(); // The pseudo instruction is gone now. 
12140 return sinkMBB; 12141} 12142 12143MachineBasicBlock * 12144X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, 12145 bool Is64Bit) const { 12146 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12147 DebugLoc DL = MI->getDebugLoc(); 12148 MachineFunction *MF = BB->getParent(); 12149 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 12150 12151 assert(getTargetMachine().Options.EnableSegmentedStacks); 12152 12153 unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; 12154 unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; 12155 12156 // BB: 12157 // ... [Till the alloca] 12158 // If stacklet is not large enough, jump to mallocMBB 12159 // 12160 // bumpMBB: 12161 // Allocate by subtracting from RSP 12162 // Jump to continueMBB 12163 // 12164 // mallocMBB: 12165 // Allocate by call to runtime 12166 // 12167 // continueMBB: 12168 // ... 12169 // [rest of original BB] 12170 // 12171 12172 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12173 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12174 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12175 12176 MachineRegisterInfo &MRI = MF->getRegInfo(); 12177 const TargetRegisterClass *AddrRegClass = 12178 getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); 12179 12180 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), 12181 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), 12182 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), 12183 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), 12184 sizeVReg = MI->getOperand(1).getReg(), 12185 physSPReg = Is64Bit ? X86::RSP : X86::ESP; 12186 12187 MachineFunction::iterator MBBIter = BB; 12188 ++MBBIter; 12189 12190 MF->insert(MBBIter, bumpMBB); 12191 MF->insert(MBBIter, mallocMBB); 12192 MF->insert(MBBIter, continueMBB); 12193 12194 continueMBB->splice(continueMBB->begin(), BB, llvm::next 12195 (MachineBasicBlock::iterator(MI)), BB->end()); 12196 continueMBB->transferSuccessorsAndUpdatePHIs(BB); 12197 12198 // Add code to the main basic block to check if the stack limit has been hit, 12199 // and if so, jump to mallocMBB otherwise to bumpMBB. 12200 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); 12201 BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) 12202 .addReg(tmpSPVReg).addReg(sizeVReg); 12203 BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) 12204 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) 12205 .addReg(SPLimitVReg); 12206 BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); 12207 12208 // bumpMBB simply decreases the stack pointer, since we know the current 12209 // stacklet has enough space. 12210 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) 12211 .addReg(SPLimitVReg); 12212 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) 12213 .addReg(SPLimitVReg); 12214 BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 12215 12216 // Calls into a routine in libgcc to allocate more space from the heap. 
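  // The size to allocate is passed in RDI (64-bit) or pushed on the stack
  // (32-bit), and the resulting pointer comes back in RAX/EAX below; the
  // extra SUB/ADD of the stack pointer in 32-bit mode presumably keeps the
  // stack 16-byte aligned around the call.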
12217 const uint32_t *RegMask =
12218 getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
12219 if (Is64Bit) {
12220 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
12221 .addReg(sizeVReg);
12222 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
12223 .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI)
12224 .addRegMask(RegMask)
12225 .addReg(X86::RAX, RegState::ImplicitDefine);
12226 } else {
12227 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
12228 .addImm(12);
12229 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
12230 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
12231 .addExternalSymbol("__morestack_allocate_stack_space")
12232 .addRegMask(RegMask)
12233 .addReg(X86::EAX, RegState::ImplicitDefine);
12234 }
12235
12236 if (!Is64Bit)
12237 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
12238 .addImm(16);
12239
12240 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
12241 .addReg(Is64Bit ? X86::RAX : X86::EAX);
12242 BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
12243
12244 // Set up the CFG correctly.
12245 BB->addSuccessor(bumpMBB);
12246 BB->addSuccessor(mallocMBB);
12247 mallocMBB->addSuccessor(continueMBB);
12248 bumpMBB->addSuccessor(continueMBB);
12249
12250 // Take care of the PHI nodes.
12251 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
12252 MI->getOperand(0).getReg())
12253 .addReg(mallocPtrVReg).addMBB(mallocMBB)
12254 .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
12255
12256 // Delete the original pseudo instruction.
12257 MI->eraseFromParent();
12258
12259 // And we're done.
12260 return continueMBB;
12261}
12262
12263 MachineBasicBlock *
12264 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
12265 MachineBasicBlock *BB) const {
12266 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
12267 DebugLoc DL = MI->getDebugLoc();
12268
12269 assert(!Subtarget->isTargetEnvMacho());
12270
12271 // The lowering is pretty easy: we're just emitting the call to _alloca. The
12272 // non-trivial part is impdef of ESP.
12273
12274 if (Subtarget->isTargetWin64()) {
12275 if (Subtarget->isTargetCygMing()) {
12276 // ___chkstk(Mingw64):
12277 // Clobbers R10, R11, RAX and EFLAGS.
12278 // Updates RSP.
12279 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
12280 .addExternalSymbol("___chkstk")
12281 .addReg(X86::RAX, RegState::Implicit)
12282 .addReg(X86::RSP, RegState::Implicit)
12283 .addReg(X86::RAX, RegState::Define | RegState::Implicit)
12284 .addReg(X86::RSP, RegState::Define | RegState::Implicit)
12285 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
12286 } else {
12287 // __chkstk(MSVCRT): does not update stack pointer.
12288 // Clobbers R10, R11 and EFLAGS.
12289 // FIXME: RAX(allocated size) might be reused and not killed.
12290 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
12291 .addExternalSymbol("__chkstk")
12292 .addReg(X86::RAX, RegState::Implicit)
12293 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
12294 // RAX has the offset to be subtracted from RSP.
12295 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
12296 .addReg(X86::RSP)
12297 .addReg(X86::RAX);
12298 }
12299 } else {
12300 const char *StackProbeSymbol =
12301 Subtarget->isTargetWindows() ?
"_chkstk" : "_alloca"; 12302 12303 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 12304 .addExternalSymbol(StackProbeSymbol) 12305 .addReg(X86::EAX, RegState::Implicit) 12306 .addReg(X86::ESP, RegState::Implicit) 12307 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 12308 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 12309 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 12310 } 12311 12312 MI->eraseFromParent(); // The pseudo instruction is gone now. 12313 return BB; 12314} 12315 12316MachineBasicBlock * 12317X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 12318 MachineBasicBlock *BB) const { 12319 // This is pretty easy. We're taking the value that we received from 12320 // our load from the relocation, sticking it in either RDI (x86-64) 12321 // or EAX and doing an indirect call. The return value will then 12322 // be in the normal return register. 12323 const X86InstrInfo *TII 12324 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 12325 DebugLoc DL = MI->getDebugLoc(); 12326 MachineFunction *F = BB->getParent(); 12327 12328 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 12329 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 12330 12331 // Get a register mask for the lowered call. 12332 // FIXME: The 32-bit calls have non-standard calling conventions. Use a 12333 // proper register mask. 12334 const uint32_t *RegMask = 12335 getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); 12336 if (Subtarget->is64Bit()) { 12337 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 12338 TII->get(X86::MOV64rm), X86::RDI) 12339 .addReg(X86::RIP) 12340 .addImm(0).addReg(0) 12341 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 12342 MI->getOperand(3).getTargetFlags()) 12343 .addReg(0); 12344 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 12345 addDirectMem(MIB, X86::RDI); 12346 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); 12347 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 12348 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 12349 TII->get(X86::MOV32rm), X86::EAX) 12350 .addReg(0) 12351 .addImm(0).addReg(0) 12352 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 12353 MI->getOperand(3).getTargetFlags()) 12354 .addReg(0); 12355 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 12356 addDirectMem(MIB, X86::EAX); 12357 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 12358 } else { 12359 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 12360 TII->get(X86::MOV32rm), X86::EAX) 12361 .addReg(TII->getGlobalBaseReg(F)) 12362 .addImm(0).addReg(0) 12363 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 12364 MI->getOperand(3).getTargetFlags()) 12365 .addReg(0); 12366 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 12367 addDirectMem(MIB, X86::EAX); 12368 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 12369 } 12370 12371 MI->eraseFromParent(); // The pseudo instruction is gone now. 
12372 return BB; 12373} 12374 12375MachineBasicBlock * 12376X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 12377 MachineBasicBlock *BB) const { 12378 switch (MI->getOpcode()) { 12379 default: llvm_unreachable("Unexpected instr type to insert"); 12380 case X86::TAILJMPd64: 12381 case X86::TAILJMPr64: 12382 case X86::TAILJMPm64: 12383 llvm_unreachable("TAILJMP64 would not be touched here."); 12384 case X86::TCRETURNdi64: 12385 case X86::TCRETURNri64: 12386 case X86::TCRETURNmi64: 12387 return BB; 12388 case X86::WIN_ALLOCA: 12389 return EmitLoweredWinAlloca(MI, BB); 12390 case X86::SEG_ALLOCA_32: 12391 return EmitLoweredSegAlloca(MI, BB, false); 12392 case X86::SEG_ALLOCA_64: 12393 return EmitLoweredSegAlloca(MI, BB, true); 12394 case X86::TLSCall_32: 12395 case X86::TLSCall_64: 12396 return EmitLoweredTLSCall(MI, BB); 12397 case X86::CMOV_GR8: 12398 case X86::CMOV_FR32: 12399 case X86::CMOV_FR64: 12400 case X86::CMOV_V4F32: 12401 case X86::CMOV_V2F64: 12402 case X86::CMOV_V2I64: 12403 case X86::CMOV_V8F32: 12404 case X86::CMOV_V4F64: 12405 case X86::CMOV_V4I64: 12406 case X86::CMOV_GR16: 12407 case X86::CMOV_GR32: 12408 case X86::CMOV_RFP32: 12409 case X86::CMOV_RFP64: 12410 case X86::CMOV_RFP80: 12411 return EmitLoweredSelect(MI, BB); 12412 12413 case X86::FP32_TO_INT16_IN_MEM: 12414 case X86::FP32_TO_INT32_IN_MEM: 12415 case X86::FP32_TO_INT64_IN_MEM: 12416 case X86::FP64_TO_INT16_IN_MEM: 12417 case X86::FP64_TO_INT32_IN_MEM: 12418 case X86::FP64_TO_INT64_IN_MEM: 12419 case X86::FP80_TO_INT16_IN_MEM: 12420 case X86::FP80_TO_INT32_IN_MEM: 12421 case X86::FP80_TO_INT64_IN_MEM: { 12422 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12423 DebugLoc DL = MI->getDebugLoc(); 12424 12425 // Change the floating point control register to use "round towards zero" 12426 // mode when truncating to an integer value. 12427 MachineFunction *F = BB->getParent(); 12428 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 12429 addFrameReference(BuildMI(*BB, MI, DL, 12430 TII->get(X86::FNSTCW16m)), CWFrameIdx); 12431 12432 // Load the old value of the high byte of the control word... 12433 unsigned OldCW = 12434 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 12435 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 12436 CWFrameIdx); 12437 12438 // Set the high part to be round to zero... 12439 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 12440 .addImm(0xC7F); 12441 12442 // Reload the modified control word now... 12443 addFrameReference(BuildMI(*BB, MI, DL, 12444 TII->get(X86::FLDCW16m)), CWFrameIdx); 12445 12446 // Restore the memory image of control word to original value 12447 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 12448 .addReg(OldCW); 12449 12450 // Get the X86 opcode to use. 
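    // The IST_Fp<n>m<m> pseudos selected below store the top of the x87 stack
    // to memory as an <n>-bit integer, with <m> giving the width of the source
    // FP value; the round-towards-zero control word installed above is what
    // gives this the truncating FP-to-int semantics.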
12451 unsigned Opc; 12452 switch (MI->getOpcode()) { 12453 default: llvm_unreachable("illegal opcode!"); 12454 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 12455 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 12456 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 12457 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 12458 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 12459 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 12460 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 12461 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 12462 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 12463 } 12464 12465 X86AddressMode AM; 12466 MachineOperand &Op = MI->getOperand(0); 12467 if (Op.isReg()) { 12468 AM.BaseType = X86AddressMode::RegBase; 12469 AM.Base.Reg = Op.getReg(); 12470 } else { 12471 AM.BaseType = X86AddressMode::FrameIndexBase; 12472 AM.Base.FrameIndex = Op.getIndex(); 12473 } 12474 Op = MI->getOperand(1); 12475 if (Op.isImm()) 12476 AM.Scale = Op.getImm(); 12477 Op = MI->getOperand(2); 12478 if (Op.isImm()) 12479 AM.IndexReg = Op.getImm(); 12480 Op = MI->getOperand(3); 12481 if (Op.isGlobal()) { 12482 AM.GV = Op.getGlobal(); 12483 } else { 12484 AM.Disp = Op.getImm(); 12485 } 12486 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 12487 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 12488 12489 // Reload the original control word now. 12490 addFrameReference(BuildMI(*BB, MI, DL, 12491 TII->get(X86::FLDCW16m)), CWFrameIdx); 12492 12493 MI->eraseFromParent(); // The pseudo instruction is gone now. 12494 return BB; 12495 } 12496 // String/text processing lowering. 12497 case X86::PCMPISTRM128REG: 12498 case X86::VPCMPISTRM128REG: 12499 return EmitPCMP(MI, BB, 3, false /* in-mem */); 12500 case X86::PCMPISTRM128MEM: 12501 case X86::VPCMPISTRM128MEM: 12502 return EmitPCMP(MI, BB, 3, true /* in-mem */); 12503 case X86::PCMPESTRM128REG: 12504 case X86::VPCMPESTRM128REG: 12505 return EmitPCMP(MI, BB, 5, false /* in mem */); 12506 case X86::PCMPESTRM128MEM: 12507 case X86::VPCMPESTRM128MEM: 12508 return EmitPCMP(MI, BB, 5, true /* in mem */); 12509 12510 // Thread synchronization. 12511 case X86::MONITOR: 12512 return EmitMonitor(MI, BB); 12513 case X86::MWAIT: 12514 return EmitMwait(MI, BB); 12515 12516 // Atomic Lowering. 
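    // Each ATOM* pseudo below is expanded by the EmitAtomic* helpers into a
    // load / operate / LCMPXCHG retry loop (note the LCMPXCHG* opcodes passed
    // in); the min/max variants select the new value with the given CMOVcc.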
12517 case X86::ATOMAND32: 12518 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 12519 X86::AND32ri, X86::MOV32rm, 12520 X86::LCMPXCHG32, 12521 X86::NOT32r, X86::EAX, 12522 X86::GR32RegisterClass); 12523 case X86::ATOMOR32: 12524 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 12525 X86::OR32ri, X86::MOV32rm, 12526 X86::LCMPXCHG32, 12527 X86::NOT32r, X86::EAX, 12528 X86::GR32RegisterClass); 12529 case X86::ATOMXOR32: 12530 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 12531 X86::XOR32ri, X86::MOV32rm, 12532 X86::LCMPXCHG32, 12533 X86::NOT32r, X86::EAX, 12534 X86::GR32RegisterClass); 12535 case X86::ATOMNAND32: 12536 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 12537 X86::AND32ri, X86::MOV32rm, 12538 X86::LCMPXCHG32, 12539 X86::NOT32r, X86::EAX, 12540 X86::GR32RegisterClass, true); 12541 case X86::ATOMMIN32: 12542 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 12543 case X86::ATOMMAX32: 12544 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 12545 case X86::ATOMUMIN32: 12546 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 12547 case X86::ATOMUMAX32: 12548 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 12549 12550 case X86::ATOMAND16: 12551 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 12552 X86::AND16ri, X86::MOV16rm, 12553 X86::LCMPXCHG16, 12554 X86::NOT16r, X86::AX, 12555 X86::GR16RegisterClass); 12556 case X86::ATOMOR16: 12557 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 12558 X86::OR16ri, X86::MOV16rm, 12559 X86::LCMPXCHG16, 12560 X86::NOT16r, X86::AX, 12561 X86::GR16RegisterClass); 12562 case X86::ATOMXOR16: 12563 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 12564 X86::XOR16ri, X86::MOV16rm, 12565 X86::LCMPXCHG16, 12566 X86::NOT16r, X86::AX, 12567 X86::GR16RegisterClass); 12568 case X86::ATOMNAND16: 12569 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 12570 X86::AND16ri, X86::MOV16rm, 12571 X86::LCMPXCHG16, 12572 X86::NOT16r, X86::AX, 12573 X86::GR16RegisterClass, true); 12574 case X86::ATOMMIN16: 12575 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 12576 case X86::ATOMMAX16: 12577 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 12578 case X86::ATOMUMIN16: 12579 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 12580 case X86::ATOMUMAX16: 12581 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 12582 12583 case X86::ATOMAND8: 12584 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 12585 X86::AND8ri, X86::MOV8rm, 12586 X86::LCMPXCHG8, 12587 X86::NOT8r, X86::AL, 12588 X86::GR8RegisterClass); 12589 case X86::ATOMOR8: 12590 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 12591 X86::OR8ri, X86::MOV8rm, 12592 X86::LCMPXCHG8, 12593 X86::NOT8r, X86::AL, 12594 X86::GR8RegisterClass); 12595 case X86::ATOMXOR8: 12596 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 12597 X86::XOR8ri, X86::MOV8rm, 12598 X86::LCMPXCHG8, 12599 X86::NOT8r, X86::AL, 12600 X86::GR8RegisterClass); 12601 case X86::ATOMNAND8: 12602 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 12603 X86::AND8ri, X86::MOV8rm, 12604 X86::LCMPXCHG8, 12605 X86::NOT8r, X86::AL, 12606 X86::GR8RegisterClass, true); 12607 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 12608 // This group is for 64-bit host. 
12609 case X86::ATOMAND64: 12610 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 12611 X86::AND64ri32, X86::MOV64rm, 12612 X86::LCMPXCHG64, 12613 X86::NOT64r, X86::RAX, 12614 X86::GR64RegisterClass); 12615 case X86::ATOMOR64: 12616 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 12617 X86::OR64ri32, X86::MOV64rm, 12618 X86::LCMPXCHG64, 12619 X86::NOT64r, X86::RAX, 12620 X86::GR64RegisterClass); 12621 case X86::ATOMXOR64: 12622 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 12623 X86::XOR64ri32, X86::MOV64rm, 12624 X86::LCMPXCHG64, 12625 X86::NOT64r, X86::RAX, 12626 X86::GR64RegisterClass); 12627 case X86::ATOMNAND64: 12628 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 12629 X86::AND64ri32, X86::MOV64rm, 12630 X86::LCMPXCHG64, 12631 X86::NOT64r, X86::RAX, 12632 X86::GR64RegisterClass, true); 12633 case X86::ATOMMIN64: 12634 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 12635 case X86::ATOMMAX64: 12636 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 12637 case X86::ATOMUMIN64: 12638 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 12639 case X86::ATOMUMAX64: 12640 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 12641 12642 // This group does 64-bit operations on a 32-bit host. 12643 case X86::ATOMAND6432: 12644 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12645 X86::AND32rr, X86::AND32rr, 12646 X86::AND32ri, X86::AND32ri, 12647 false); 12648 case X86::ATOMOR6432: 12649 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12650 X86::OR32rr, X86::OR32rr, 12651 X86::OR32ri, X86::OR32ri, 12652 false); 12653 case X86::ATOMXOR6432: 12654 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12655 X86::XOR32rr, X86::XOR32rr, 12656 X86::XOR32ri, X86::XOR32ri, 12657 false); 12658 case X86::ATOMNAND6432: 12659 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12660 X86::AND32rr, X86::AND32rr, 12661 X86::AND32ri, X86::AND32ri, 12662 true); 12663 case X86::ATOMADD6432: 12664 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12665 X86::ADD32rr, X86::ADC32rr, 12666 X86::ADD32ri, X86::ADC32ri, 12667 false); 12668 case X86::ATOMSUB6432: 12669 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12670 X86::SUB32rr, X86::SBB32rr, 12671 X86::SUB32ri, X86::SBB32ri, 12672 false); 12673 case X86::ATOMSWAP6432: 12674 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12675 X86::MOV32rr, X86::MOV32rr, 12676 X86::MOV32ri, X86::MOV32ri, 12677 false); 12678 case X86::VASTART_SAVE_XMM_REGS: 12679 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 12680 12681 case X86::VAARG_64: 12682 return EmitVAARG64WithCustomInserter(MI, BB); 12683 } 12684} 12685 12686//===----------------------------------------------------------------------===// 12687// X86 Optimization Hooks 12688//===----------------------------------------------------------------------===// 12689 12690void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 12691 const APInt &Mask, 12692 APInt &KnownZero, 12693 APInt &KnownOne, 12694 const SelectionDAG &DAG, 12695 unsigned Depth) const { 12696 unsigned Opc = Op.getOpcode(); 12697 assert((Opc >= ISD::BUILTIN_OP_END || 12698 Opc == ISD::INTRINSIC_WO_CHAIN || 12699 Opc == ISD::INTRINSIC_W_CHAIN || 12700 Opc == ISD::INTRINSIC_VOID) && 12701 "Should use MaskedValueIsZero if you don't know whether Op" 12702 " is a target node!"); 12703 12704 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
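  // The nodes handled below produce a boolean either as their only result
  // (SETCC) or as their second result (the arithmetic-with-flags nodes), so
  // every bit above bit 0 can be reported as known zero.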
12705 switch (Opc) { 12706 default: break; 12707 case X86ISD::ADD: 12708 case X86ISD::SUB: 12709 case X86ISD::ADC: 12710 case X86ISD::SBB: 12711 case X86ISD::SMUL: 12712 case X86ISD::UMUL: 12713 case X86ISD::INC: 12714 case X86ISD::DEC: 12715 case X86ISD::OR: 12716 case X86ISD::XOR: 12717 case X86ISD::AND: 12718 // These nodes' second result is a boolean. 12719 if (Op.getResNo() == 0) 12720 break; 12721 // Fallthrough 12722 case X86ISD::SETCC: 12723 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 12724 Mask.getBitWidth() - 1); 12725 break; 12726 case ISD::INTRINSIC_WO_CHAIN: { 12727 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 12728 unsigned NumLoBits = 0; 12729 switch (IntId) { 12730 default: break; 12731 case Intrinsic::x86_sse_movmsk_ps: 12732 case Intrinsic::x86_avx_movmsk_ps_256: 12733 case Intrinsic::x86_sse2_movmsk_pd: 12734 case Intrinsic::x86_avx_movmsk_pd_256: 12735 case Intrinsic::x86_mmx_pmovmskb: 12736 case Intrinsic::x86_sse2_pmovmskb_128: 12737 case Intrinsic::x86_avx2_pmovmskb: { 12738 // High bits of movmskp{s|d}, pmovmskb are known zero. 12739 switch (IntId) { 12740 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 12741 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; 12742 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; 12743 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; 12744 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; 12745 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; 12746 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; 12747 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; 12748 } 12749 KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(), 12750 Mask.getBitWidth() - NumLoBits); 12751 break; 12752 } 12753 } 12754 break; 12755 } 12756 } 12757} 12758 12759unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 12760 unsigned Depth) const { 12761 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 12762 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 12763 return Op.getValueType().getScalarType().getSizeInBits(); 12764 12765 // Fallback case. 12766 return 1; 12767} 12768 12769/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 12770/// node is a GlobalAddress + offset. 
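/// For example, an X86ISD::Wrapper around (globaladdress @g + 8) returns
/// true with GA = @g and Offset = 8; anything else is deferred to the
/// generic TargetLowering implementation.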
12771bool X86TargetLowering::isGAPlusOffset(SDNode *N, 12772 const GlobalValue* &GA, 12773 int64_t &Offset) const { 12774 if (N->getOpcode() == X86ISD::Wrapper) { 12775 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 12776 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 12777 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 12778 return true; 12779 } 12780 } 12781 return TargetLowering::isGAPlusOffset(N, GA, Offset); 12782} 12783 12784/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 12785/// same as extracting the high 128-bit part of 256-bit vector and then 12786/// inserting the result into the low part of a new 256-bit vector 12787static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 12788 EVT VT = SVOp->getValueType(0); 12789 int NumElems = VT.getVectorNumElements(); 12790 12791 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 12792 for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j) 12793 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 12794 SVOp->getMaskElt(j) >= 0) 12795 return false; 12796 12797 return true; 12798} 12799 12800/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 12801/// same as extracting the low 128-bit part of 256-bit vector and then 12802/// inserting the result into the high part of a new 256-bit vector 12803static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 12804 EVT VT = SVOp->getValueType(0); 12805 int NumElems = VT.getVectorNumElements(); 12806 12807 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 12808 for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j) 12809 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 12810 SVOp->getMaskElt(j) >= 0) 12811 return false; 12812 12813 return true; 12814} 12815 12816/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 12817static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 12818 TargetLowering::DAGCombinerInfo &DCI, 12819 const X86Subtarget* Subtarget) { 12820 DebugLoc dl = N->getDebugLoc(); 12821 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 12822 SDValue V1 = SVOp->getOperand(0); 12823 SDValue V2 = SVOp->getOperand(1); 12824 EVT VT = SVOp->getValueType(0); 12825 int NumElems = VT.getVectorNumElements(); 12826 12827 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 12828 V2.getOpcode() == ISD::CONCAT_VECTORS) { 12829 // 12830 // 0,0,0,... 12831 // | 12832 // V UNDEF BUILD_VECTOR UNDEF 12833 // \ / \ / 12834 // CONCAT_VECTOR CONCAT_VECTOR 12835 // \ / 12836 // \ / 12837 // RESULT: V + zero extended 12838 // 12839 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 12840 V2.getOperand(1).getOpcode() != ISD::UNDEF || 12841 V1.getOperand(1).getOpcode() != ISD::UNDEF) 12842 return SDValue(); 12843 12844 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 12845 return SDValue(); 12846 12847 // To match the shuffle mask, the first half of the mask should 12848 // be exactly the first vector, and all the rest a splat with the 12849 // first element of the second one. 12850 for (int i = 0; i < NumElems/2; ++i) 12851 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 12852 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 12853 return SDValue(); 12854 12855 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 
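      // A VZEXT_LOAD reads the 128-bit source directly from memory and
      // implicitly zeroes the upper lanes of the 256-bit result, which is
      // exactly the "V + zero extended" pattern being matched here.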
12856 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { 12857 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); 12858 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; 12859 SDValue ResNode = 12860 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2, 12861 Ld->getMemoryVT(), 12862 Ld->getPointerInfo(), 12863 Ld->getAlignment(), 12864 false/*isVolatile*/, true/*ReadMem*/, 12865 false/*WriteMem*/); 12866 return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); 12867 } 12868 12869 // Emit a zeroed vector and insert the desired subvector on its 12870 // first half. 12871 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 12872 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 12873 DAG.getConstant(0, MVT::i32), DAG, dl); 12874 return DCI.CombineTo(N, InsV); 12875 } 12876 12877 //===--------------------------------------------------------------------===// 12878 // Combine some shuffles into subvector extracts and inserts: 12879 // 12880 12881 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 12882 if (isShuffleHigh128VectorInsertLow(SVOp)) { 12883 SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32), 12884 DAG, dl); 12885 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), 12886 V, DAG.getConstant(0, MVT::i32), DAG, dl); 12887 return DCI.CombineTo(N, InsV); 12888 } 12889 12890 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 12891 if (isShuffleLow128VectorInsertHigh(SVOp)) { 12892 SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl); 12893 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), 12894 V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 12895 return DCI.CombineTo(N, InsV); 12896 } 12897 12898 return SDValue(); 12899} 12900 12901/// PerformShuffleCombine - Performs several different shuffle combines. 12902static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 12903 TargetLowering::DAGCombinerInfo &DCI, 12904 const X86Subtarget *Subtarget) { 12905 DebugLoc dl = N->getDebugLoc(); 12906 EVT VT = N->getValueType(0); 12907 12908 // Don't create instructions with illegal types after legalize types has run. 12909 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12910 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 12911 return SDValue(); 12912 12913 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode 12914 if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 && 12915 N->getOpcode() == ISD::VECTOR_SHUFFLE) 12916 return PerformShuffleCombine256(N, DAG, DCI, Subtarget); 12917 12918 // Only handle 128 wide vector from here on. 12919 if (VT.getSizeInBits() != 128) 12920 return SDValue(); 12921 12922 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 12923 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 12924 // consecutive, non-overlapping, and in the right order. 12925 SmallVector<SDValue, 16> Elts; 12926 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 12927 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 12928 12929 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 12930} 12931 12932 12933/// PerformTruncateCombine - Converts truncate operation to 12934/// a sequence of vector shuffle operations. 
12935/// It is possible when we truncate 256-bit vector to 128-bit vector 12936 12937SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, 12938 DAGCombinerInfo &DCI) const { 12939 if (!DCI.isBeforeLegalizeOps()) 12940 return SDValue(); 12941 12942 if (!Subtarget->hasAVX()) return SDValue(); 12943 12944 EVT VT = N->getValueType(0); 12945 SDValue Op = N->getOperand(0); 12946 EVT OpVT = Op.getValueType(); 12947 DebugLoc dl = N->getDebugLoc(); 12948 12949 if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) { 12950 12951 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, 12952 DAG.getIntPtrConstant(0)); 12953 12954 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, 12955 DAG.getIntPtrConstant(2)); 12956 12957 OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); 12958 OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); 12959 12960 // PSHUFD 12961 int ShufMask1[] = {0, 2, 0, 0}; 12962 12963 OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), 12964 ShufMask1); 12965 OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), 12966 ShufMask1); 12967 12968 // MOVLHPS 12969 int ShufMask2[] = {0, 1, 4, 5}; 12970 12971 return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2); 12972 } 12973 if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) { 12974 12975 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, 12976 DAG.getIntPtrConstant(0)); 12977 12978 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, 12979 DAG.getIntPtrConstant(4)); 12980 12981 OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo); 12982 OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi); 12983 12984 // PSHUFB 12985 int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, 12986 -1, -1, -1, -1, -1, -1, -1, -1}; 12987 12988 OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, 12989 DAG.getUNDEF(MVT::v16i8), 12990 ShufMask1); 12991 OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, 12992 DAG.getUNDEF(MVT::v16i8), 12993 ShufMask1); 12994 12995 OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); 12996 OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); 12997 12998 // MOVLHPS 12999 int ShufMask2[] = {0, 1, 4, 5}; 13000 13001 SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2); 13002 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res); 13003 } 13004 13005 return SDValue(); 13006} 13007 13008/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 13009/// generation and convert it from being a bunch of shuffles and extracts 13010/// to a simple store and scalar loads to extract the elements. 13011static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 13012 const TargetLowering &TLI) { 13013 SDValue InputVector = N->getOperand(0); 13014 13015 // Only operate on vectors of 4 elements, where the alternative shuffling 13016 // gets to be more expensive. 13017 if (InputVector.getValueType() != MVT::v4i32) 13018 return SDValue(); 13019 13020 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 13021 // single use which is a sign-extend or zero-extend, and all elements are 13022 // used. 
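  // ExtractedElements accumulates a bitmask of the extracted lane indices,
  // so for a v4i32 input a final value of 15 means all four lanes are used.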
13023 SmallVector<SDNode *, 4> Uses;
13024 unsigned ExtractedElements = 0;
13025 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
13026 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
13027 if (UI.getUse().getResNo() != InputVector.getResNo())
13028 return SDValue();
13029
13030 SDNode *Extract = *UI;
13031 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13032 return SDValue();
13033
13034 if (Extract->getValueType(0) != MVT::i32)
13035 return SDValue();
13036 if (!Extract->hasOneUse())
13037 return SDValue();
13038 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
13039 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
13040 return SDValue();
13041 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
13042 return SDValue();
13043
13044 // Record which element was extracted.
13045 ExtractedElements |=
13046 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
13047
13048 Uses.push_back(Extract);
13049 }
13050
13051 // If not all the elements were used, this may not be worthwhile.
13052 if (ExtractedElements != 15)
13053 return SDValue();
13054
13055 // Ok, we've now decided to do the transformation.
13056 DebugLoc dl = InputVector.getDebugLoc();
13057
13058 // Store the value to a temporary stack slot.
13059 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
13060 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
13061 MachinePointerInfo(), false, false, 0);
13062
13063 // Replace each use (extract) with a load of the appropriate element.
13064 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
13065 UE = Uses.end(); UI != UE; ++UI) {
13066 SDNode *Extract = *UI;
13067
13068 // Compute the element's address.
13069 SDValue Idx = Extract->getOperand(1);
13070 unsigned EltSize =
13071 InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
13072 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
13073 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
13074
13075 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
13076 StackPtr, OffsetVal);
13077
13078 // Load the scalar.
13079 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
13080 ScalarAddr, MachinePointerInfo(),
13081 false, false, false, 0);
13082
13083 // Replace the extract with the load.
13084 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
13085 }
13086
13087 // The replacement was made in place; don't return anything.
13088 return SDValue();
13089}
13090
13091/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
13092/// nodes.
13093static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
13094 TargetLowering::DAGCombinerInfo &DCI,
13095 const X86Subtarget *Subtarget) {
13096 DebugLoc DL = N->getDebugLoc();
13097 SDValue Cond = N->getOperand(0);
13098 // Get the LHS/RHS of the select.
13099 SDValue LHS = N->getOperand(1);
13100 SDValue RHS = N->getOperand(2);
13101 EVT VT = LHS.getValueType();
13102
13103 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
13104 // instructions match the semantics of the common C idiom x<y?x:y but not
13105 // x<=y?x:y, because of how they handle negative zero (which can be
13106 // ignored in unsafe-math mode).
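  // For example, (a < b) ? a : b can become X86ISD::FMIN (MINSS/MINPS); the
  // SSE min/max instructions return their second operand when both inputs are
  // zero (of either sign) or when a NaN is involved, which is why the cases
  // below sometimes swap operands or refuse the transform.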
13107 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && 13108 VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) && 13109 (Subtarget->hasSSE2() || 13110 (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { 13111 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 13112 13113 unsigned Opcode = 0; 13114 // Check for x CC y ? x : y. 13115 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 13116 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 13117 switch (CC) { 13118 default: break; 13119 case ISD::SETULT: 13120 // Converting this to a min would handle NaNs incorrectly, and swapping 13121 // the operands would cause it to handle comparisons between positive 13122 // and negative zero incorrectly. 13123 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 13124 if (!DAG.getTarget().Options.UnsafeFPMath && 13125 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 13126 break; 13127 std::swap(LHS, RHS); 13128 } 13129 Opcode = X86ISD::FMIN; 13130 break; 13131 case ISD::SETOLE: 13132 // Converting this to a min would handle comparisons between positive 13133 // and negative zero incorrectly. 13134 if (!DAG.getTarget().Options.UnsafeFPMath && 13135 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 13136 break; 13137 Opcode = X86ISD::FMIN; 13138 break; 13139 case ISD::SETULE: 13140 // Converting this to a min would handle both negative zeros and NaNs 13141 // incorrectly, but we can swap the operands to fix both. 13142 std::swap(LHS, RHS); 13143 case ISD::SETOLT: 13144 case ISD::SETLT: 13145 case ISD::SETLE: 13146 Opcode = X86ISD::FMIN; 13147 break; 13148 13149 case ISD::SETOGE: 13150 // Converting this to a max would handle comparisons between positive 13151 // and negative zero incorrectly. 13152 if (!DAG.getTarget().Options.UnsafeFPMath && 13153 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 13154 break; 13155 Opcode = X86ISD::FMAX; 13156 break; 13157 case ISD::SETUGT: 13158 // Converting this to a max would handle NaNs incorrectly, and swapping 13159 // the operands would cause it to handle comparisons between positive 13160 // and negative zero incorrectly. 13161 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 13162 if (!DAG.getTarget().Options.UnsafeFPMath && 13163 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 13164 break; 13165 std::swap(LHS, RHS); 13166 } 13167 Opcode = X86ISD::FMAX; 13168 break; 13169 case ISD::SETUGE: 13170 // Converting this to a max would handle both negative zeros and NaNs 13171 // incorrectly, but we can swap the operands to fix both. 13172 std::swap(LHS, RHS); 13173 case ISD::SETOGT: 13174 case ISD::SETGT: 13175 case ISD::SETGE: 13176 Opcode = X86ISD::FMAX; 13177 break; 13178 } 13179 // Check for x CC y ? y : x -- a min/max with reversed arms. 13180 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 13181 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 13182 switch (CC) { 13183 default: break; 13184 case ISD::SETOGE: 13185 // Converting this to a min would handle comparisons between positive 13186 // and negative zero incorrectly, and swapping the operands would 13187 // cause it to handle NaNs incorrectly. 13188 if (!DAG.getTarget().Options.UnsafeFPMath && 13189 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 13190 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 13191 break; 13192 std::swap(LHS, RHS); 13193 } 13194 Opcode = X86ISD::FMIN; 13195 break; 13196 case ISD::SETUGT: 13197 // Converting this to a min would handle NaNs incorrectly. 
13198 if (!DAG.getTarget().Options.UnsafeFPMath && 13199 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 13200 break; 13201 Opcode = X86ISD::FMIN; 13202 break; 13203 case ISD::SETUGE: 13204 // Converting this to a min would handle both negative zeros and NaNs 13205 // incorrectly, but we can swap the operands to fix both. 13206 std::swap(LHS, RHS); 13207 case ISD::SETOGT: 13208 case ISD::SETGT: 13209 case ISD::SETGE: 13210 Opcode = X86ISD::FMIN; 13211 break; 13212 13213 case ISD::SETULT: 13214 // Converting this to a max would handle NaNs incorrectly. 13215 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 13216 break; 13217 Opcode = X86ISD::FMAX; 13218 break; 13219 case ISD::SETOLE: 13220 // Converting this to a max would handle comparisons between positive 13221 // and negative zero incorrectly, and swapping the operands would 13222 // cause it to handle NaNs incorrectly. 13223 if (!DAG.getTarget().Options.UnsafeFPMath && 13224 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 13225 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 13226 break; 13227 std::swap(LHS, RHS); 13228 } 13229 Opcode = X86ISD::FMAX; 13230 break; 13231 case ISD::SETULE: 13232 // Converting this to a max would handle both negative zeros and NaNs 13233 // incorrectly, but we can swap the operands to fix both. 13234 std::swap(LHS, RHS); 13235 case ISD::SETOLT: 13236 case ISD::SETLT: 13237 case ISD::SETLE: 13238 Opcode = X86ISD::FMAX; 13239 break; 13240 } 13241 } 13242 13243 if (Opcode) 13244 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 13245 } 13246 13247 // If this is a select between two integer constants, try to do some 13248 // optimizations. 13249 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 13250 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 13251 // Don't do this for crazy integer types. 13252 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 13253 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 13254 // so that TrueC (the true value) is larger than FalseC. 13255 bool NeedsCondInvert = false; 13256 13257 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 13258 // Efficiently invertible. 13259 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 13260 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 13261 isa<ConstantSDNode>(Cond.getOperand(1))))) { 13262 NeedsCondInvert = true; 13263 std::swap(TrueC, FalseC); 13264 } 13265 13266 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 13267 if (FalseC->getAPIntValue() == 0 && 13268 TrueC->getAPIntValue().isPowerOf2()) { 13269 if (NeedsCondInvert) // Invert the condition if needed. 13270 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 13271 DAG.getConstant(1, Cond.getValueType())); 13272 13273 // Zero extend the condition if needed. 13274 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 13275 13276 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 13277 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 13278 DAG.getConstant(ShAmt, MVT::i8)); 13279 } 13280 13281 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 13282 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 13283 if (NeedsCondInvert) // Invert the condition if needed. 13284 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 13285 DAG.getConstant(1, Cond.getValueType())); 13286 13287 // Zero extend the condition if needed. 
13288 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 13289 FalseC->getValueType(0), Cond); 13290 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 13291 SDValue(FalseC, 0)); 13292 } 13293 13294 // Optimize cases that will turn into an LEA instruction. This requires 13295 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 13296 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 13297 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 13298 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 13299 13300 bool isFastMultiplier = false; 13301 if (Diff < 10) { 13302 switch ((unsigned char)Diff) { 13303 default: break; 13304 case 1: // result = add base, cond 13305 case 2: // result = lea base( , cond*2) 13306 case 3: // result = lea base(cond, cond*2) 13307 case 4: // result = lea base( , cond*4) 13308 case 5: // result = lea base(cond, cond*4) 13309 case 8: // result = lea base( , cond*8) 13310 case 9: // result = lea base(cond, cond*8) 13311 isFastMultiplier = true; 13312 break; 13313 } 13314 } 13315 13316 if (isFastMultiplier) { 13317 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 13318 if (NeedsCondInvert) // Invert the condition if needed. 13319 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 13320 DAG.getConstant(1, Cond.getValueType())); 13321 13322 // Zero extend the condition if needed. 13323 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 13324 Cond); 13325 // Scale the condition by the difference. 13326 if (Diff != 1) 13327 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 13328 DAG.getConstant(Diff, Cond.getValueType())); 13329 13330 // Add the base if non-zero. 13331 if (FalseC->getAPIntValue() != 0) 13332 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 13333 SDValue(FalseC, 0)); 13334 return Cond; 13335 } 13336 } 13337 } 13338 } 13339 13340 // Canonicalize max and min: 13341 // (x > y) ? x : y -> (x >= y) ? x : y 13342 // (x < y) ? x : y -> (x <= y) ? x : y 13343 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates 13344 // the need for an extra compare 13345 // against zero. e.g. 13346 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 13347 // subl %esi, %edi 13348 // testl %edi, %edi 13349 // movl $0, %eax 13350 // cmovgl %edi, %eax 13351 // => 13352 // xorl %eax, %eax 13353 // subl %esi, $edi 13354 // cmovsl %eax, %edi 13355 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && 13356 DAG.isEqualTo(LHS, Cond.getOperand(0)) && 13357 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 13358 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 13359 switch (CC) { 13360 default: break; 13361 case ISD::SETLT: 13362 case ISD::SETGT: { 13363 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; 13364 Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(), 13365 Cond.getOperand(0), Cond.getOperand(1), NewCC); 13366 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); 13367 } 13368 } 13369 } 13370 13371 // If we know that this node is legal then we know that it is going to be 13372 // matched by one of the SSE/AVX BLEND instructions. These instructions only 13373 // depend on the highest bit in each word. Try to use SimplifyDemandedBits 13374 // to simplify previous instructions. 
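  // Since a blend only looks at the high (sign) bit of each condition
  // element, the demanded-bits mask built below covers just that one bit,
  // letting SimplifyDemandedBits shrink whatever computes the mask.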
13375 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13376 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && 13377 !DCI.isBeforeLegalize() && 13378 TLI.isOperationLegal(ISD::VSELECT, VT)) { 13379 unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); 13380 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); 13381 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); 13382 13383 APInt KnownZero, KnownOne; 13384 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), 13385 DCI.isBeforeLegalizeOps()); 13386 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || 13387 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) 13388 DCI.CommitTargetLoweringOpt(TLO); 13389 } 13390 13391 return SDValue(); 13392} 13393 13394/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 13395static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 13396 TargetLowering::DAGCombinerInfo &DCI) { 13397 DebugLoc DL = N->getDebugLoc(); 13398 13399 // If the flag operand isn't dead, don't touch this CMOV. 13400 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 13401 return SDValue(); 13402 13403 SDValue FalseOp = N->getOperand(0); 13404 SDValue TrueOp = N->getOperand(1); 13405 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 13406 SDValue Cond = N->getOperand(3); 13407 if (CC == X86::COND_E || CC == X86::COND_NE) { 13408 switch (Cond.getOpcode()) { 13409 default: break; 13410 case X86ISD::BSR: 13411 case X86ISD::BSF: 13412 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 13413 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 13414 return (CC == X86::COND_E) ? FalseOp : TrueOp; 13415 } 13416 } 13417 13418 // If this is a select between two integer constants, try to do some 13419 // optimizations. Note that the operands are ordered the opposite of SELECT 13420 // operands. 13421 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 13422 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 13423 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 13424 // larger than FalseC (the false value). 13425 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 13426 CC = X86::GetOppositeBranchCondition(CC); 13427 std::swap(TrueC, FalseC); 13428 } 13429 13430 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 13431 // This is efficient for any integer data type (including i8/i16) and 13432 // shift amount. 13433 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 13434 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 13435 DAG.getConstant(CC, MVT::i8), Cond); 13436 13437 // Zero extend the condition if needed. 13438 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 13439 13440 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 13441 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 13442 DAG.getConstant(ShAmt, MVT::i8)); 13443 if (N->getNumValues() == 2) // Dead flag value? 13444 return DCI.CombineTo(N, Cond, SDValue()); 13445 return Cond; 13446 } 13447 13448 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 13449 // for any integer data type, including i8/i16. 13450 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 13451 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 13452 DAG.getConstant(CC, MVT::i8), Cond); 13453 13454 // Zero extend the condition if needed. 
13455 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 13456 FalseC->getValueType(0), Cond); 13457 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 13458 SDValue(FalseC, 0)); 13459 13460 if (N->getNumValues() == 2) // Dead flag value? 13461 return DCI.CombineTo(N, Cond, SDValue()); 13462 return Cond; 13463 } 13464 13465 // Optimize cases that will turn into an LEA instruction. This requires 13466 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 13467 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 13468 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 13469 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 13470 13471 bool isFastMultiplier = false; 13472 if (Diff < 10) { 13473 switch ((unsigned char)Diff) { 13474 default: break; 13475 case 1: // result = add base, cond 13476 case 2: // result = lea base( , cond*2) 13477 case 3: // result = lea base(cond, cond*2) 13478 case 4: // result = lea base( , cond*4) 13479 case 5: // result = lea base(cond, cond*4) 13480 case 8: // result = lea base( , cond*8) 13481 case 9: // result = lea base(cond, cond*8) 13482 isFastMultiplier = true; 13483 break; 13484 } 13485 } 13486 13487 if (isFastMultiplier) { 13488 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 13489 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 13490 DAG.getConstant(CC, MVT::i8), Cond); 13491 // Zero extend the condition if needed. 13492 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 13493 Cond); 13494 // Scale the condition by the difference. 13495 if (Diff != 1) 13496 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 13497 DAG.getConstant(Diff, Cond.getValueType())); 13498 13499 // Add the base if non-zero. 13500 if (FalseC->getAPIntValue() != 0) 13501 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 13502 SDValue(FalseC, 0)); 13503 if (N->getNumValues() == 2) // Dead flag value? 13504 return DCI.CombineTo(N, Cond, SDValue()); 13505 return Cond; 13506 } 13507 } 13508 } 13509 } 13510 return SDValue(); 13511} 13512 13513 13514/// PerformMulCombine - Optimize a single multiply with constant into two 13515/// in order to implement it with two cheaper instructions, e.g. 13516/// LEA + SHL, LEA + LEA. 13517static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 13518 TargetLowering::DAGCombinerInfo &DCI) { 13519 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13520 return SDValue(); 13521 13522 EVT VT = N->getValueType(0); 13523 if (VT != MVT::i64) 13524 return SDValue(); 13525 13526 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13527 if (!C) 13528 return SDValue(); 13529 uint64_t MulAmt = C->getZExtValue(); 13530 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 13531 return SDValue(); 13532 13533 uint64_t MulAmt1 = 0; 13534 uint64_t MulAmt2 = 0; 13535 if ((MulAmt % 9) == 0) { 13536 MulAmt1 = 9; 13537 MulAmt2 = MulAmt / 9; 13538 } else if ((MulAmt % 5) == 0) { 13539 MulAmt1 = 5; 13540 MulAmt2 = MulAmt / 5; 13541 } else if ((MulAmt % 3) == 0) { 13542 MulAmt1 = 3; 13543 MulAmt2 = MulAmt / 3; 13544 } 13545 if (MulAmt2 && 13546 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 13547 DebugLoc DL = N->getDebugLoc(); 13548 13549 if (isPowerOf2_64(MulAmt2) && 13550 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 13551 // If second multiplifer is pow2, issue it first. 
We want the multiply by
13552 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
13553 // is an add.
13554 std::swap(MulAmt1, MulAmt2);
13555
13556 SDValue NewMul;
13557 if (isPowerOf2_64(MulAmt1))
13558 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
13559 DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
13560 else
13561 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
13562 DAG.getConstant(MulAmt1, VT));
13563
13564 if (isPowerOf2_64(MulAmt2))
13565 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
13566 DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
13567 else
13568 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
13569 DAG.getConstant(MulAmt2, VT));
13570
13571 // Do not add new nodes to DAG combiner worklist.
13572 DCI.CombineTo(N, NewMul, false);
13573 }
13574 return SDValue();
13575}
13576
13577static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
13578 SDValue N0 = N->getOperand(0);
13579 SDValue N1 = N->getOperand(1);
13580 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
13581 EVT VT = N0.getValueType();
13582
13583 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
13584 // since the result of setcc_c is all zero's or all ones.
13585 if (VT.isInteger() && !VT.isVector() &&
13586 N1C && N0.getOpcode() == ISD::AND &&
13587 N0.getOperand(1).getOpcode() == ISD::Constant) {
13588 SDValue N00 = N0.getOperand(0);
13589 if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
13590 ((N00.getOpcode() == ISD::ANY_EXTEND ||
13591 N00.getOpcode() == ISD::ZERO_EXTEND) &&
13592 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
13593 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
13594 APInt ShAmt = N1C->getAPIntValue();
13595 Mask = Mask.shl(ShAmt);
13596 if (Mask != 0)
13597 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
13598 N00, DAG.getConstant(Mask, VT));
13599 }
13600 }
13601
13602
13603 // Hardware support for vector shifts is sparse which makes us scalarize the
13604 // vector operations in many cases. Also, on sandybridge ADD is faster than
13605 // shl.
13606 // (shl V, 1) -> add V,V
13607 if (isSplatVector(N1.getNode())) {
13608 assert(N0.getValueType().isVector() && "Invalid vector shift type");
13609 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
13610 // We shift all of the values by one. In many cases we do not have
13611 // hardware support for this operation. This is better expressed as an ADD
13612 // of two values.
13613 if (N1C && (1 == N1C->getZExtValue())) {
13614 return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
13615 }
13616 }
13617
13618 return SDValue();
13619}
13620
13621/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
13622/// when possible.
13623static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
13624 TargetLowering::DAGCombinerInfo &DCI,
13625 const X86Subtarget *Subtarget) {
13626 EVT VT = N->getValueType(0);
13627 if (N->getOpcode() == ISD::SHL) {
13628 SDValue V = PerformSHLCombine(N, DAG);
13629 if (V.getNode()) return V;
13630 }
13631
13632 // On X86 with SSE2 support, we can transform this to a vector shift if
13633 // all elements are shifted by the same amount. We can't do this in legalize
13634 // because a constant vector is typically transformed to a constant pool
13635 // so we have no knowledge of the shift amount.
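  // For example, (shl <4 x i32> %v, <5, 5, 5, 5>) can be emitted as a single
  // PSLLD by the getTargetVShiftNode calls below, using one scalar shift
  // amount instead of four independent scalar shifts.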
13636 if (!Subtarget->hasSSE2()) 13637 return SDValue(); 13638 13639 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && 13640 (!Subtarget->hasAVX2() || 13641 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) 13642 return SDValue(); 13643 13644 SDValue ShAmtOp = N->getOperand(1); 13645 EVT EltVT = VT.getVectorElementType(); 13646 DebugLoc DL = N->getDebugLoc(); 13647 SDValue BaseShAmt = SDValue(); 13648 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 13649 unsigned NumElts = VT.getVectorNumElements(); 13650 unsigned i = 0; 13651 for (; i != NumElts; ++i) { 13652 SDValue Arg = ShAmtOp.getOperand(i); 13653 if (Arg.getOpcode() == ISD::UNDEF) continue; 13654 BaseShAmt = Arg; 13655 break; 13656 } 13657 // Handle the case where the build_vector is all undef 13658 // FIXME: Should DAG allow this? 13659 if (i == NumElts) 13660 return SDValue(); 13661 13662 for (; i != NumElts; ++i) { 13663 SDValue Arg = ShAmtOp.getOperand(i); 13664 if (Arg.getOpcode() == ISD::UNDEF) continue; 13665 if (Arg != BaseShAmt) { 13666 return SDValue(); 13667 } 13668 } 13669 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 13670 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 13671 SDValue InVec = ShAmtOp.getOperand(0); 13672 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 13673 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 13674 unsigned i = 0; 13675 for (; i != NumElts; ++i) { 13676 SDValue Arg = InVec.getOperand(i); 13677 if (Arg.getOpcode() == ISD::UNDEF) continue; 13678 BaseShAmt = Arg; 13679 break; 13680 } 13681 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 13682 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 13683 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 13684 if (C->getZExtValue() == SplatIdx) 13685 BaseShAmt = InVec.getOperand(1); 13686 } 13687 } 13688 if (BaseShAmt.getNode() == 0) { 13689 // Don't create instructions with illegal types after legalize 13690 // types has run. 13691 if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) && 13692 !DCI.isBeforeLegalize()) 13693 return SDValue(); 13694 13695 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 13696 DAG.getIntPtrConstant(0)); 13697 } 13698 } else 13699 return SDValue(); 13700 13701 // The shift amount is an i32. 13702 if (EltVT.bitsGT(MVT::i32)) 13703 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 13704 else if (EltVT.bitsLT(MVT::i32)) 13705 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 13706 13707 // The shift amount is identical so we can do a vector shift. 
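  // Dispatch on the scalar opcode: SHL maps to X86ISD::VSHLI, SRA to VSRAI and
  // SRL to VSRLI. Note the SRA case deliberately omits v2i64/v4i64, since
  // there is no packed arithmetic right shift on 64-bit elements.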
13708 SDValue ValOp = N->getOperand(0); 13709 switch (N->getOpcode()) { 13710 default: 13711 llvm_unreachable("Unknown shift opcode!"); 13712 case ISD::SHL: 13713 switch (VT.getSimpleVT().SimpleTy) { 13714 default: return SDValue(); 13715 case MVT::v2i64: 13716 case MVT::v4i32: 13717 case MVT::v8i16: 13718 case MVT::v4i64: 13719 case MVT::v8i32: 13720 case MVT::v16i16: 13721 return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG); 13722 } 13723 case ISD::SRA: 13724 switch (VT.getSimpleVT().SimpleTy) { 13725 default: return SDValue(); 13726 case MVT::v4i32: 13727 case MVT::v8i16: 13728 case MVT::v8i32: 13729 case MVT::v16i16: 13730 return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG); 13731 } 13732 case ISD::SRL: 13733 switch (VT.getSimpleVT().SimpleTy) { 13734 default: return SDValue(); 13735 case MVT::v2i64: 13736 case MVT::v4i32: 13737 case MVT::v8i16: 13738 case MVT::v4i64: 13739 case MVT::v8i32: 13740 case MVT::v16i16: 13741 return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG); 13742 } 13743 } 13744} 13745 13746 13747// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 13748// where both setccs reference the same FP CMP, and rewrite for CMPEQSS 13749// and friends. Likewise for OR -> CMPNEQSS. 13750static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 13751 TargetLowering::DAGCombinerInfo &DCI, 13752 const X86Subtarget *Subtarget) { 13753 unsigned opcode; 13754 13755 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 13756 // we're requiring SSE2 for both. 13757 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 13758 SDValue N0 = N->getOperand(0); 13759 SDValue N1 = N->getOperand(1); 13760 SDValue CMP0 = N0->getOperand(1); 13761 SDValue CMP1 = N1->getOperand(1); 13762 DebugLoc DL = N->getDebugLoc(); 13763 13764 // The SETCCs should both refer to the same CMP. 13765 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 13766 return SDValue(); 13767 13768 SDValue CMP00 = CMP0->getOperand(0); 13769 SDValue CMP01 = CMP0->getOperand(1); 13770 EVT VT = CMP00.getValueType(); 13771 13772 if (VT == MVT::f32 || VT == MVT::f64) { 13773 bool ExpectingFlags = false; 13774 // Check for any users that want flags: 13775 for (SDNode::use_iterator UI = N->use_begin(), 13776 UE = N->use_end(); 13777 !ExpectingFlags && UI != UE; ++UI) 13778 switch (UI->getOpcode()) { 13779 default: 13780 case ISD::BR_CC: 13781 case ISD::BRCOND: 13782 case ISD::SELECT: 13783 ExpectingFlags = true; 13784 break; 13785 case ISD::CopyToReg: 13786 case ISD::SIGN_EXTEND: 13787 case ISD::ZERO_EXTEND: 13788 case ISD::ANY_EXTEND: 13789 break; 13790 } 13791 13792 if (!ExpectingFlags) { 13793 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 13794 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 13795 13796 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 13797 X86::CondCode tmp = cc0; 13798 cc0 = cc1; 13799 cc1 = tmp; 13800 } 13801 13802 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 13803 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 13804 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 13805 X86ISD::NodeType NTOperator = is64BitFP ? 13806 X86ISD::FSETCCsd : X86ISD::FSETCCss; 13807 // FIXME: need symbolic constants for these magic numbers. 13808 // See X86ATTInstPrinter.cpp:printSSECC(). 13809 unsigned x86cc = (cc0 == X86::COND_E) ? 
0 : 4; 13810 SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, 13811 DAG.getConstant(x86cc, MVT::i8)); 13812 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, 13813 OnesOrZeroesF); 13814 SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, 13815 DAG.getConstant(1, MVT::i32)); 13816 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); 13817 return OneBitOfTruth; 13818 } 13819 } 13820 } 13821 } 13822 return SDValue(); 13823} 13824 13825/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector 13826/// so it can be folded inside ANDNP. 13827static bool CanFoldXORWithAllOnes(const SDNode *N) { 13828 EVT VT = N->getValueType(0); 13829 13830 // Match direct AllOnes for 128 and 256-bit vectors 13831 if (ISD::isBuildVectorAllOnes(N)) 13832 return true; 13833 13834 // Look through a bit convert. 13835 if (N->getOpcode() == ISD::BITCAST) 13836 N = N->getOperand(0).getNode(); 13837 13838 // Sometimes the operand may come from a insert_subvector building a 256-bit 13839 // allones vector 13840 if (VT.getSizeInBits() == 256 && 13841 N->getOpcode() == ISD::INSERT_SUBVECTOR) { 13842 SDValue V1 = N->getOperand(0); 13843 SDValue V2 = N->getOperand(1); 13844 13845 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && 13846 V1.getOperand(0).getOpcode() == ISD::UNDEF && 13847 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && 13848 ISD::isBuildVectorAllOnes(V2.getNode())) 13849 return true; 13850 } 13851 13852 return false; 13853} 13854 13855static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 13856 TargetLowering::DAGCombinerInfo &DCI, 13857 const X86Subtarget *Subtarget) { 13858 if (DCI.isBeforeLegalizeOps()) 13859 return SDValue(); 13860 13861 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 13862 if (R.getNode()) 13863 return R; 13864 13865 EVT VT = N->getValueType(0); 13866 13867 // Create ANDN, BLSI, and BLSR instructions 13868 // BLSI is X & (-X) 13869 // BLSR is X & (X-1) 13870 if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { 13871 SDValue N0 = N->getOperand(0); 13872 SDValue N1 = N->getOperand(1); 13873 DebugLoc DL = N->getDebugLoc(); 13874 13875 // Check LHS for not 13876 if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1))) 13877 return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1); 13878 // Check RHS for not 13879 if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1))) 13880 return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0); 13881 13882 // Check LHS for neg 13883 if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && 13884 isZero(N0.getOperand(0))) 13885 return DAG.getNode(X86ISD::BLSI, DL, VT, N1); 13886 13887 // Check RHS for neg 13888 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && 13889 isZero(N1.getOperand(0))) 13890 return DAG.getNode(X86ISD::BLSI, DL, VT, N0); 13891 13892 // Check LHS for X-1 13893 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 13894 isAllOnes(N0.getOperand(1))) 13895 return DAG.getNode(X86ISD::BLSR, DL, VT, N1); 13896 13897 // Check RHS for X-1 13898 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 13899 isAllOnes(N1.getOperand(1))) 13900 return DAG.getNode(X86ISD::BLSR, DL, VT, N0); 13901 13902 return SDValue(); 13903 } 13904 13905 // Want to form ANDNP nodes: 13906 // 1) In the hopes of then easily combining them with OR and AND nodes 13907 // to form PBLEND/PSIGN. 
13908 // 2) To match ANDN packed intrinsics 13909 if (VT != MVT::v2i64 && VT != MVT::v4i64) 13910 return SDValue(); 13911 13912 SDValue N0 = N->getOperand(0); 13913 SDValue N1 = N->getOperand(1); 13914 DebugLoc DL = N->getDebugLoc(); 13915 13916 // Check LHS for vnot 13917 if (N0.getOpcode() == ISD::XOR && 13918 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 13919 CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) 13920 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); 13921 13922 // Check RHS for vnot 13923 if (N1.getOpcode() == ISD::XOR && 13924 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 13925 CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) 13926 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); 13927 13928 return SDValue(); 13929} 13930 13931static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 13932 TargetLowering::DAGCombinerInfo &DCI, 13933 const X86Subtarget *Subtarget) { 13934 if (DCI.isBeforeLegalizeOps()) 13935 return SDValue(); 13936 13937 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 13938 if (R.getNode()) 13939 return R; 13940 13941 EVT VT = N->getValueType(0); 13942 13943 SDValue N0 = N->getOperand(0); 13944 SDValue N1 = N->getOperand(1); 13945 13946 // look for psign/blend 13947 if (VT == MVT::v2i64 || VT == MVT::v4i64) { 13948 if (!Subtarget->hasSSSE3() || 13949 (VT == MVT::v4i64 && !Subtarget->hasAVX2())) 13950 return SDValue(); 13951 13952 // Canonicalize pandn to RHS 13953 if (N0.getOpcode() == X86ISD::ANDNP) 13954 std::swap(N0, N1); 13955 // or (and (m, y), (pandn m, x)) 13956 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { 13957 SDValue Mask = N1.getOperand(0); 13958 SDValue X = N1.getOperand(1); 13959 SDValue Y; 13960 if (N0.getOperand(0) == Mask) 13961 Y = N0.getOperand(1); 13962 if (N0.getOperand(1) == Mask) 13963 Y = N0.getOperand(0); 13964 13965 // Check to see if the mask appeared in both the AND and ANDNP and 13966 if (!Y.getNode()) 13967 return SDValue(); 13968 13969 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 13970 if (Mask.getOpcode() != ISD::BITCAST || 13971 X.getOpcode() != ISD::BITCAST || 13972 Y.getOpcode() != ISD::BITCAST) 13973 return SDValue(); 13974 13975 // Look through mask bitcast. 13976 Mask = Mask.getOperand(0); 13977 EVT MaskVT = Mask.getValueType(); 13978 13979 // Validate that the Mask operand is a vector sra node. 13980 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 13981 // there is no psrai.b 13982 if (Mask.getOpcode() != X86ISD::VSRAI) 13983 return SDValue(); 13984 13985 // Check that the SRA is all signbits. 13986 SDValue SraC = Mask.getOperand(1); 13987 unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 13988 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 13989 if ((SraAmt + 1) != EltBits) 13990 return SDValue(); 13991 13992 DebugLoc DL = N->getDebugLoc(); 13993 13994 // Now we know we at least have a plendvb with the mask val. See if 13995 // we can form a psignb/w/d. 
13996 // psign = x.type == y.type == mask.type && y = sub(0, x); 13997 X = X.getOperand(0); 13998 Y = Y.getOperand(0); 13999 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 14000 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 14001 X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { 14002 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && 14003 "Unsupported VT for PSIGN"); 14004 Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); 14005 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 14006 } 14007 // PBLENDVB only available on SSE 4.1 14008 if (!Subtarget->hasSSE41()) 14009 return SDValue(); 14010 14011 EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; 14012 14013 X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); 14014 Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); 14015 Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); 14016 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); 14017 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 14018 } 14019 } 14020 14021 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 14022 return SDValue(); 14023 14024 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 14025 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 14026 std::swap(N0, N1); 14027 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 14028 return SDValue(); 14029 if (!N0.hasOneUse() || !N1.hasOneUse()) 14030 return SDValue(); 14031 14032 SDValue ShAmt0 = N0.getOperand(1); 14033 if (ShAmt0.getValueType() != MVT::i8) 14034 return SDValue(); 14035 SDValue ShAmt1 = N1.getOperand(1); 14036 if (ShAmt1.getValueType() != MVT::i8) 14037 return SDValue(); 14038 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 14039 ShAmt0 = ShAmt0.getOperand(0); 14040 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 14041 ShAmt1 = ShAmt1.getOperand(0); 14042 14043 DebugLoc DL = N->getDebugLoc(); 14044 unsigned Opc = X86ISD::SHLD; 14045 SDValue Op0 = N0.getOperand(0); 14046 SDValue Op1 = N1.getOperand(0); 14047 if (ShAmt0.getOpcode() == ISD::SUB) { 14048 Opc = X86ISD::SHRD; 14049 std::swap(Op0, Op1); 14050 std::swap(ShAmt0, ShAmt1); 14051 } 14052 14053 unsigned Bits = VT.getSizeInBits(); 14054 if (ShAmt1.getOpcode() == ISD::SUB) { 14055 SDValue Sum = ShAmt1.getOperand(0); 14056 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 14057 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 14058 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 14059 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 14060 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 14061 return DAG.getNode(Opc, DL, VT, 14062 Op0, Op1, 14063 DAG.getNode(ISD::TRUNCATE, DL, 14064 MVT::i8, ShAmt0)); 14065 } 14066 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 14067 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 14068 if (ShAmt0C && 14069 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 14070 return DAG.getNode(Opc, DL, VT, 14071 N0.getOperand(0), N1.getOperand(0), 14072 DAG.getNode(ISD::TRUNCATE, DL, 14073 MVT::i8, ShAmt0)); 14074 } 14075 14076 return SDValue(); 14077} 14078 14079// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes 14080static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, 14081 TargetLowering::DAGCombinerInfo &DCI, 14082 const X86Subtarget *Subtarget) { 14083 if (DCI.isBeforeLegalizeOps()) 14084 return SDValue(); 14085 14086 EVT VT = N->getValueType(0); 14087 14088 if (VT != MVT::i32 && VT != MVT::i64) 14089 return SDValue(); 14090 14091 assert(Subtarget->hasBMI() && "Creating 
BLSMSK requires BMI instructions"); 14092 14093 // Create BLSMSK instructions by finding X ^ (X-1) 14094 SDValue N0 = N->getOperand(0); 14095 SDValue N1 = N->getOperand(1); 14096 DebugLoc DL = N->getDebugLoc(); 14097 14098 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 14099 isAllOnes(N0.getOperand(1))) 14100 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); 14101 14102 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 14103 isAllOnes(N1.getOperand(1))) 14104 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); 14105 14106 return SDValue(); 14107} 14108 14109/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. 14110static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, 14111 const X86Subtarget *Subtarget) { 14112 LoadSDNode *Ld = cast<LoadSDNode>(N); 14113 EVT RegVT = Ld->getValueType(0); 14114 EVT MemVT = Ld->getMemoryVT(); 14115 DebugLoc dl = Ld->getDebugLoc(); 14116 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14117 14118 ISD::LoadExtType Ext = Ld->getExtensionType(); 14119 14120 // If this is a vector EXT Load then attempt to optimize it using a 14121 // shuffle. We need SSE4 for the shuffles. 14122 // TODO: It is possible to support ZExt by zeroing the undef values 14123 // during the shuffle phase or after the shuffle. 14124 if (RegVT.isVector() && RegVT.isInteger() && 14125 Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { 14126 assert(MemVT != RegVT && "Cannot extend to the same type"); 14127 assert(MemVT.isVector() && "Must load a vector from memory"); 14128 14129 unsigned NumElems = RegVT.getVectorNumElements(); 14130 unsigned RegSz = RegVT.getSizeInBits(); 14131 unsigned MemSz = MemVT.getSizeInBits(); 14132 assert(RegSz > MemSz && "Register size must be greater than the mem size"); 14133 // All sizes must be a power of two 14134 if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue(); 14135 14136 // Attempt to load the original value using a single load op. 14137 // Find a scalar type which is equal to the loaded word size. 14138 MVT SclrLoadTy = MVT::i8; 14139 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 14140 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 14141 MVT Tp = (MVT::SimpleValueType)tp; 14142 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) { 14143 SclrLoadTy = Tp; 14144 break; 14145 } 14146 } 14147 14148 // Proceed if a load word is found. 14149 if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue(); 14150 14151 EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, 14152 RegSz/SclrLoadTy.getSizeInBits()); 14153 14154 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 14155 RegSz/MemVT.getScalarType().getSizeInBits()); 14156 // Can't shuffle using an illegal type. 14157 if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); 14158 14159 // Perform a single load. 14160 SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), 14161 Ld->getBasePtr(), 14162 Ld->getPointerInfo(), Ld->isVolatile(), 14163 Ld->isNonTemporal(), Ld->isInvariant(), 14164 Ld->getAlignment()); 14165 14166 // Insert the word loaded into a vector. 14167 SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 14168 LoadUnitVecVT, ScalarLoad); 14169 14170 // Bitcast the loaded value to a vector of the original element type, in 14171 // the size of the target vector type. 14172 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, 14173 ScalarInVector); 14174 unsigned SizeRatio = RegSz/MemSz; 14175 14176 // Redistribute the loaded elements into the different locations. 
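    // For example (illustrative only): extending a v8i8 load into a v8i16
    // register gives SizeRatio = 2, so the mask built below over the v16i8
    // view is <0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1>, placing each
    // loaded byte in the low half of its i16 lane before the final bitcast to
    // the register type.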
14177 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 14178 for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i; 14179 14180 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 14181 DAG.getUNDEF(SlicedVec.getValueType()), 14182 ShuffleVec.data()); 14183 14184 // Bitcast to the requested type. 14185 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 14186 // Replace the original load with the new sequence 14187 // and return the new chain. 14188 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff); 14189 return SDValue(ScalarLoad.getNode(), 1); 14190 } 14191 14192 return SDValue(); 14193} 14194 14195/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 14196static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 14197 const X86Subtarget *Subtarget) { 14198 StoreSDNode *St = cast<StoreSDNode>(N); 14199 EVT VT = St->getValue().getValueType(); 14200 EVT StVT = St->getMemoryVT(); 14201 DebugLoc dl = St->getDebugLoc(); 14202 SDValue StoredVal = St->getOperand(1); 14203 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14204 14205 // If we are saving a concatenation of two XMM registers, perform two stores. 14206 // This is better in Sandy Bridge cause one 256-bit mem op is done via two 14207 // 128-bit ones. If in the future the cost becomes only one memory access the 14208 // first version would be better. 14209 if (VT.getSizeInBits() == 256 && 14210 StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && 14211 StoredVal.getNumOperands() == 2) { 14212 14213 SDValue Value0 = StoredVal.getOperand(0); 14214 SDValue Value1 = StoredVal.getOperand(1); 14215 14216 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); 14217 SDValue Ptr0 = St->getBasePtr(); 14218 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); 14219 14220 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 14221 St->getPointerInfo(), St->isVolatile(), 14222 St->isNonTemporal(), St->getAlignment()); 14223 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 14224 St->getPointerInfo(), St->isVolatile(), 14225 St->isNonTemporal(), St->getAlignment()); 14226 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 14227 } 14228 14229 // Optimize trunc store (of multiple scalars) to shuffle and store. 14230 // First, pack all of the elements in one place. Next, store to memory 14231 // in fewer chunks. 14232 if (St->isTruncatingStore() && VT.isVector()) { 14233 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14234 unsigned NumElems = VT.getVectorNumElements(); 14235 assert(StVT != VT && "Cannot truncate to the same type"); 14236 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 14237 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 14238 14239 // From, To sizes and ElemCount must be pow of two 14240 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 14241 // We are going to use the original vector elt for storing. 14242 // Accumulated smaller vector elements must be a multiple of the store size. 
14243 if (0 != (NumElems * FromSz) % ToSz) return SDValue(); 14244 14245 unsigned SizeRatio = FromSz / ToSz; 14246 14247 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 14248 14249 // Create a type on which we perform the shuffle 14250 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 14251 StVT.getScalarType(), NumElems*SizeRatio); 14252 14253 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 14254 14255 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); 14256 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 14257 for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio; 14258 14259 // Can't shuffle using an illegal type 14260 if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); 14261 14262 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 14263 DAG.getUNDEF(WideVec.getValueType()), 14264 ShuffleVec.data()); 14265 // At this point all of the data is stored at the bottom of the 14266 // register. We now need to save it to mem. 14267 14268 // Find the largest store unit 14269 MVT StoreType = MVT::i8; 14270 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 14271 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 14272 MVT Tp = (MVT::SimpleValueType)tp; 14273 if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz) 14274 StoreType = Tp; 14275 } 14276 14277 // Bitcast the original vector into a vector of store-size units 14278 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 14279 StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); 14280 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 14281 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); 14282 SmallVector<SDValue, 8> Chains; 14283 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, 14284 TLI.getPointerTy()); 14285 SDValue Ptr = St->getBasePtr(); 14286 14287 // Perform one or more big stores into memory. 14288 for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) { 14289 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 14290 StoreType, ShuffWide, 14291 DAG.getIntPtrConstant(i)); 14292 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 14293 St->getPointerInfo(), St->isVolatile(), 14294 St->isNonTemporal(), St->getAlignment()); 14295 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 14296 Chains.push_back(Ch); 14297 } 14298 14299 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 14300 Chains.size()); 14301 } 14302 14303 14304 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 14305 // the FP state in cases where an emms may be missing. 14306 // A preferable solution to the general problem is to figure out the right 14307 // places to insert EMMS. This qualifies as a quick hack. 14308 14309 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 
14310 if (VT.getSizeInBits() != 64) 14311 return SDValue(); 14312 14313 const Function *F = DAG.getMachineFunction().getFunction(); 14314 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 14315 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps 14316 && Subtarget->hasSSE2(); 14317 if ((VT.isVector() || 14318 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 14319 isa<LoadSDNode>(St->getValue()) && 14320 !cast<LoadSDNode>(St->getValue())->isVolatile() && 14321 St->getChain().hasOneUse() && !St->isVolatile()) { 14322 SDNode* LdVal = St->getValue().getNode(); 14323 LoadSDNode *Ld = 0; 14324 int TokenFactorIndex = -1; 14325 SmallVector<SDValue, 8> Ops; 14326 SDNode* ChainVal = St->getChain().getNode(); 14327 // Must be a store of a load. We currently handle two cases: the load 14328 // is a direct child, and it's under an intervening TokenFactor. It is 14329 // possible to dig deeper under nested TokenFactors. 14330 if (ChainVal == LdVal) 14331 Ld = cast<LoadSDNode>(St->getChain()); 14332 else if (St->getValue().hasOneUse() && 14333 ChainVal->getOpcode() == ISD::TokenFactor) { 14334 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { 14335 if (ChainVal->getOperand(i).getNode() == LdVal) { 14336 TokenFactorIndex = i; 14337 Ld = cast<LoadSDNode>(St->getValue()); 14338 } else 14339 Ops.push_back(ChainVal->getOperand(i)); 14340 } 14341 } 14342 14343 if (!Ld || !ISD::isNormalLoad(Ld)) 14344 return SDValue(); 14345 14346 // If this is not the MMX case, i.e. we are just turning i64 load/store 14347 // into f64 load/store, avoid the transformation if there are multiple 14348 // uses of the loaded value. 14349 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 14350 return SDValue(); 14351 14352 DebugLoc LdDL = Ld->getDebugLoc(); 14353 DebugLoc StDL = N->getDebugLoc(); 14354 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 14355 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 14356 // pair instead. 14357 if (Subtarget->is64Bit() || F64IsLegal) { 14358 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 14359 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 14360 Ld->getPointerInfo(), Ld->isVolatile(), 14361 Ld->isNonTemporal(), Ld->isInvariant(), 14362 Ld->getAlignment()); 14363 SDValue NewChain = NewLd.getValue(1); 14364 if (TokenFactorIndex != -1) { 14365 Ops.push_back(NewChain); 14366 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 14367 Ops.size()); 14368 } 14369 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 14370 St->getPointerInfo(), 14371 St->isVolatile(), St->isNonTemporal(), 14372 St->getAlignment()); 14373 } 14374 14375 // Otherwise, lower to two pairs of 32-bit loads / stores. 
14376 SDValue LoAddr = Ld->getBasePtr(); 14377 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 14378 DAG.getConstant(4, MVT::i32)); 14379 14380 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 14381 Ld->getPointerInfo(), 14382 Ld->isVolatile(), Ld->isNonTemporal(), 14383 Ld->isInvariant(), Ld->getAlignment()); 14384 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 14385 Ld->getPointerInfo().getWithOffset(4), 14386 Ld->isVolatile(), Ld->isNonTemporal(), 14387 Ld->isInvariant(), 14388 MinAlign(Ld->getAlignment(), 4)); 14389 14390 SDValue NewChain = LoLd.getValue(1); 14391 if (TokenFactorIndex != -1) { 14392 Ops.push_back(LoLd); 14393 Ops.push_back(HiLd); 14394 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 14395 Ops.size()); 14396 } 14397 14398 LoAddr = St->getBasePtr(); 14399 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 14400 DAG.getConstant(4, MVT::i32)); 14401 14402 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 14403 St->getPointerInfo(), 14404 St->isVolatile(), St->isNonTemporal(), 14405 St->getAlignment()); 14406 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 14407 St->getPointerInfo().getWithOffset(4), 14408 St->isVolatile(), 14409 St->isNonTemporal(), 14410 MinAlign(St->getAlignment(), 4)); 14411 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 14412 } 14413 return SDValue(); 14414} 14415 14416/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" 14417/// and return the operands for the horizontal operation in LHS and RHS. A 14418/// horizontal operation performs the binary operation on successive elements 14419/// of its first operand, then on successive elements of its second operand, 14420/// returning the resulting values in a vector. For example, if 14421/// A = < float a0, float a1, float a2, float a3 > 14422/// and 14423/// B = < float b0, float b1, float b2, float b3 > 14424/// then the result of doing a horizontal operation on A and B is 14425/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 14426/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form 14427/// A horizontal-op B, for some already available A and B, and if so then LHS is 14428/// set to A, RHS to B, and the routine returns 'true'. 14429/// Note that the binary operation should have the property that if one of the 14430/// operands is UNDEF then the result is UNDEF. 14431static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { 14432 // Look for the following pattern: if 14433 // A = < float a0, float a1, float a2, float a3 > 14434 // B = < float b0, float b1, float b2, float b3 > 14435 // and 14436 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> 14437 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> 14438 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > 14439 // which is A horizontal-op B. 14440 14441 // At least one of the operands should be a vector shuffle. 14442 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && 14443 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) 14444 return false; 14445 14446 EVT VT = LHS.getValueType(); 14447 14448 assert((VT.is128BitVector() || VT.is256BitVector()) && 14449 "Unsupported vector type for horizontal add/sub"); 14450 14451 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to 14452 // operate independently on 128-bit lanes. 
14453 unsigned NumElts = VT.getVectorNumElements(); 14454 unsigned NumLanes = VT.getSizeInBits()/128; 14455 unsigned NumLaneElts = NumElts / NumLanes; 14456 assert((NumLaneElts % 2 == 0) && 14457 "Vector type should have an even number of elements in each lane"); 14458 unsigned HalfLaneElts = NumLaneElts/2; 14459 14460 // View LHS in the form 14461 // LHS = VECTOR_SHUFFLE A, B, LMask 14462 // If LHS is not a shuffle then pretend it is the shuffle 14463 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> 14464 // NOTE: in what follows a default initialized SDValue represents an UNDEF of 14465 // type VT. 14466 SDValue A, B; 14467 SmallVector<int, 16> LMask(NumElts); 14468 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 14469 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) 14470 A = LHS.getOperand(0); 14471 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) 14472 B = LHS.getOperand(1); 14473 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); 14474 std::copy(Mask.begin(), Mask.end(), LMask.begin()); 14475 } else { 14476 if (LHS.getOpcode() != ISD::UNDEF) 14477 A = LHS; 14478 for (unsigned i = 0; i != NumElts; ++i) 14479 LMask[i] = i; 14480 } 14481 14482 // Likewise, view RHS in the form 14483 // RHS = VECTOR_SHUFFLE C, D, RMask 14484 SDValue C, D; 14485 SmallVector<int, 16> RMask(NumElts); 14486 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 14487 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) 14488 C = RHS.getOperand(0); 14489 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) 14490 D = RHS.getOperand(1); 14491 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); 14492 std::copy(Mask.begin(), Mask.end(), RMask.begin()); 14493 } else { 14494 if (RHS.getOpcode() != ISD::UNDEF) 14495 C = RHS; 14496 for (unsigned i = 0; i != NumElts; ++i) 14497 RMask[i] = i; 14498 } 14499 14500 // Check that the shuffles are both shuffling the same vectors. 14501 if (!(A == C && B == D) && !(A == D && B == C)) 14502 return false; 14503 14504 // If everything is UNDEF then bail out: it would be better to fold to UNDEF. 14505 if (!A.getNode() && !B.getNode()) 14506 return false; 14507 14508 // If A and B occur in reverse order in RHS, then "swap" them (which means 14509 // rewriting the mask). 14510 if (A != C) 14511 CommuteVectorShuffleMask(RMask, NumElts); 14512 14513 // At this point LHS and RHS are equivalent to 14514 // LHS = VECTOR_SHUFFLE A, B, LMask 14515 // RHS = VECTOR_SHUFFLE A, B, RMask 14516 // Check that the masks correspond to performing a horizontal operation. 14517 for (unsigned i = 0; i != NumElts; ++i) { 14518 int LIdx = LMask[i], RIdx = RMask[i]; 14519 14520 // Ignore any UNDEF components. 14521 if (LIdx < 0 || RIdx < 0 || 14522 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || 14523 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) 14524 continue; 14525 14526 // Check that successive elements are being operated on. If not, this is 14527 // not a horizontal operation. 14528 unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs 14529 unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; 14530 int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; 14531 if (!(LIdx == Index && RIdx == Index + 1) && 14532 !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) 14533 return false; 14534 } 14535 14536 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 14537 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 
14538 return true; 14539} 14540 14541/// PerformFADDCombine - Do target-specific dag combines on floating point adds. 14542static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, 14543 const X86Subtarget *Subtarget) { 14544 EVT VT = N->getValueType(0); 14545 SDValue LHS = N->getOperand(0); 14546 SDValue RHS = N->getOperand(1); 14547 14548 // Try to synthesize horizontal adds from adds of shuffles. 14549 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 14550 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 14551 isHorizontalBinOp(LHS, RHS, true)) 14552 return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); 14553 return SDValue(); 14554} 14555 14556/// PerformFSUBCombine - Do target-specific dag combines on floating point subs. 14557static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, 14558 const X86Subtarget *Subtarget) { 14559 EVT VT = N->getValueType(0); 14560 SDValue LHS = N->getOperand(0); 14561 SDValue RHS = N->getOperand(1); 14562 14563 // Try to synthesize horizontal subs from subs of shuffles. 14564 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 14565 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 14566 isHorizontalBinOp(LHS, RHS, false)) 14567 return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); 14568 return SDValue(); 14569} 14570 14571/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 14572/// X86ISD::FXOR nodes. 14573static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 14574 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 14575 // F[X]OR(0.0, x) -> x 14576 // F[X]OR(x, 0.0) -> x 14577 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 14578 if (C->getValueAPF().isPosZero()) 14579 return N->getOperand(1); 14580 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 14581 if (C->getValueAPF().isPosZero()) 14582 return N->getOperand(0); 14583 return SDValue(); 14584} 14585 14586/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 14587static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 14588 // FAND(0.0, x) -> 0.0 14589 // FAND(x, 0.0) -> 0.0 14590 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 14591 if (C->getValueAPF().isPosZero()) 14592 return N->getOperand(0); 14593 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 14594 if (C->getValueAPF().isPosZero()) 14595 return N->getOperand(1); 14596 return SDValue(); 14597} 14598 14599static SDValue PerformBTCombine(SDNode *N, 14600 SelectionDAG &DAG, 14601 TargetLowering::DAGCombinerInfo &DCI) { 14602 // BT ignores high bits in the bit index operand. 
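  // Only the low log2(width) bits of the index are read, e.g. the low 5 bits
  // for a 32-bit operand, so everything above them is reported as not demanded
  // and can be simplified away below.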
14603 SDValue Op1 = N->getOperand(1); 14604 if (Op1.hasOneUse()) { 14605 unsigned BitWidth = Op1.getValueSizeInBits(); 14606 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 14607 APInt KnownZero, KnownOne; 14608 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 14609 !DCI.isBeforeLegalizeOps()); 14610 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14611 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 14612 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 14613 DCI.CommitTargetLoweringOpt(TLO); 14614 } 14615 return SDValue(); 14616} 14617 14618static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 14619 SDValue Op = N->getOperand(0); 14620 if (Op.getOpcode() == ISD::BITCAST) 14621 Op = Op.getOperand(0); 14622 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 14623 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 14624 VT.getVectorElementType().getSizeInBits() == 14625 OpVT.getVectorElementType().getSizeInBits()) { 14626 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 14627 } 14628 return SDValue(); 14629} 14630 14631static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, 14632 TargetLowering::DAGCombinerInfo &DCI, 14633 const X86Subtarget *Subtarget) { 14634 if (!DCI.isBeforeLegalizeOps()) 14635 return SDValue(); 14636 14637 if (!Subtarget->hasAVX()) 14638 return SDValue(); 14639 14640 // Optimize vectors in AVX mode 14641 // Sign extend v8i16 to v8i32 and 14642 // v4i32 to v4i64 14643 // 14644 // Divide input vector into two parts 14645 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} 14646 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 14647 // concat the vectors to original VT 14648 14649 EVT VT = N->getValueType(0); 14650 SDValue Op = N->getOperand(0); 14651 EVT OpVT = Op.getValueType(); 14652 DebugLoc dl = N->getDebugLoc(); 14653 14654 if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) || 14655 (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { 14656 14657 unsigned NumElems = OpVT.getVectorNumElements(); 14658 SmallVector<int,8> ShufMask1(NumElems, -1); 14659 for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i; 14660 14661 SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), 14662 ShufMask1.data()); 14663 14664 SmallVector<int,8> ShufMask2(NumElems, -1); 14665 for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2; 14666 14667 SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), 14668 ShufMask2.data()); 14669 14670 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), 14671 VT.getVectorNumElements()/2); 14672 14673 OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); 14674 OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); 14675 14676 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 14677 } 14678 return SDValue(); 14679} 14680 14681static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, 14682 const X86Subtarget *Subtarget) { 14683 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 14684 // (and (i32 x86isd::setcc_carry), 1) 14685 // This eliminates the zext. This transformation is necessary because 14686 // ISD::SETCC is always legalized to i8. 
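  // The scalar rewrite below only fires when the AND and its SETCC_CARRY
  // operand are both single-use, so re-emitting the carry at the wider type
  // does not duplicate it for other users.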
14687 DebugLoc dl = N->getDebugLoc(); 14688 SDValue N0 = N->getOperand(0); 14689 EVT VT = N->getValueType(0); 14690 EVT OpVT = N0.getValueType(); 14691 14692 if (N0.getOpcode() == ISD::AND && 14693 N0.hasOneUse() && 14694 N0.getOperand(0).hasOneUse()) { 14695 SDValue N00 = N0.getOperand(0); 14696 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 14697 return SDValue(); 14698 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 14699 if (!C || C->getZExtValue() != 1) 14700 return SDValue(); 14701 return DAG.getNode(ISD::AND, dl, VT, 14702 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 14703 N00.getOperand(0), N00.getOperand(1)), 14704 DAG.getConstant(1, VT)); 14705 } 14706 // Optimize vectors in AVX mode: 14707 // 14708 // v8i16 -> v8i32 14709 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. 14710 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. 14711 // Concat upper and lower parts. 14712 // 14713 // v4i32 -> v4i64 14714 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. 14715 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. 14716 // Concat upper and lower parts. 14717 // 14718 if (Subtarget->hasAVX()) { 14719 14720 if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || 14721 ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) { 14722 14723 SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); 14724 SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG); 14725 SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG); 14726 14727 EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 14728 VT.getVectorNumElements()/2); 14729 14730 OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); 14731 OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); 14732 14733 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 14734 } 14735 } 14736 14737 14738 return SDValue(); 14739} 14740 14741// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 14742static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 14743 unsigned X86CC = N->getConstantOperandVal(0); 14744 SDValue EFLAG = N->getOperand(1); 14745 DebugLoc DL = N->getDebugLoc(); 14746 14747 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 14748 // a zext and produces an all-ones bit which is more useful than 0/1 in some 14749 // cases. 14750 if (X86CC == X86::COND_B) 14751 return DAG.getNode(ISD::AND, DL, MVT::i8, 14752 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 14753 DAG.getConstant(X86CC, MVT::i8), EFLAG), 14754 DAG.getConstant(1, MVT::i8)); 14755 14756 return SDValue(); 14757} 14758 14759static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 14760 const X86TargetLowering *XTLI) { 14761 SDValue Op0 = N->getOperand(0); 14762 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 14763 // a 32-bit target where SSE doesn't support i64->FP operations. 
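  // The rewrite below requires the operand to be a single-use, non-volatile,
  // non-extending load of a type that is not legal on this (32-bit) target;
  // BuildFILD then reads the value directly from the load's memory operand and
  // the original load's chain users are redirected to the new node's chain.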
14764 if (Op0.getOpcode() == ISD::LOAD) { 14765 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 14766 EVT VT = Ld->getValueType(0); 14767 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 14768 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 14769 !XTLI->getSubtarget()->is64Bit() && 14770 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 14771 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 14772 Ld->getChain(), Op0, DAG); 14773 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 14774 return FILDChain; 14775 } 14776 } 14777 return SDValue(); 14778} 14779 14780// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 14781static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 14782 X86TargetLowering::DAGCombinerInfo &DCI) { 14783 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 14784 // the result is either zero or one (depending on the input carry bit). 14785 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 14786 if (X86::isZeroNode(N->getOperand(0)) && 14787 X86::isZeroNode(N->getOperand(1)) && 14788 // We don't have a good way to replace an EFLAGS use, so only do this when 14789 // dead right now. 14790 SDValue(N, 1).use_empty()) { 14791 DebugLoc DL = N->getDebugLoc(); 14792 EVT VT = N->getValueType(0); 14793 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 14794 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 14795 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 14796 DAG.getConstant(X86::COND_B,MVT::i8), 14797 N->getOperand(2)), 14798 DAG.getConstant(1, VT)); 14799 return DCI.CombineTo(N, Res1, CarryOut); 14800 } 14801 14802 return SDValue(); 14803} 14804 14805// fold (add Y, (sete X, 0)) -> adc 0, Y 14806// (add Y, (setne X, 0)) -> sbb -1, Y 14807// (sub (sete X, 0), Y) -> sbb 0, Y 14808// (sub (setne X, 0), Y) -> adc -1, Y 14809static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 14810 DebugLoc DL = N->getDebugLoc(); 14811 14812 // Look through ZExts. 14813 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 14814 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 14815 return SDValue(); 14816 14817 SDValue SetCC = Ext.getOperand(0); 14818 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 14819 return SDValue(); 14820 14821 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 14822 if (CC != X86::COND_E && CC != X86::COND_NE) 14823 return SDValue(); 14824 14825 SDValue Cmp = SetCC.getOperand(1); 14826 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 14827 !X86::isZeroNode(Cmp.getOperand(1)) || 14828 !Cmp.getOperand(0).getValueType().isInteger()) 14829 return SDValue(); 14830 14831 SDValue CmpOp0 = Cmp.getOperand(0); 14832 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 14833 DAG.getConstant(1, CmpOp0.getValueType())); 14834 14835 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 14836 if (CC == X86::COND_NE) 14837 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 14838 DL, OtherVal.getValueType(), OtherVal, 14839 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 14840 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, 14841 DL, OtherVal.getValueType(), OtherVal, 14842 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 14843} 14844 14845/// PerformADDCombine - Do target-specific dag combines on integer adds. 
14846static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, 14847 const X86Subtarget *Subtarget) { 14848 EVT VT = N->getValueType(0); 14849 SDValue Op0 = N->getOperand(0); 14850 SDValue Op1 = N->getOperand(1); 14851 14852 // Try to synthesize horizontal adds from adds of shuffles. 14853 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || 14854 (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && 14855 isHorizontalBinOp(Op0, Op1, true)) 14856 return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1); 14857 14858 return OptimizeConditionalInDecrement(N, DAG); 14859} 14860 14861static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, 14862 const X86Subtarget *Subtarget) { 14863 SDValue Op0 = N->getOperand(0); 14864 SDValue Op1 = N->getOperand(1); 14865 14866 // X86 can't encode an immediate LHS of a sub. See if we can push the 14867 // negation into a preceding instruction. 14868 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { 14869 // If the RHS of the sub is a XOR with one use and a constant, invert the 14870 // immediate. Then add one to the LHS of the sub so we can turn 14871 // X-Y -> X+~Y+1, saving one register. 14872 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && 14873 isa<ConstantSDNode>(Op1.getOperand(1))) { 14874 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); 14875 EVT VT = Op0.getValueType(); 14876 SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT, 14877 Op1.getOperand(0), 14878 DAG.getConstant(~XorC, VT)); 14879 return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor, 14880 DAG.getConstant(C->getAPIntValue()+1, VT)); 14881 } 14882 } 14883 14884 // Try to synthesize horizontal adds from adds of shuffles. 14885 EVT VT = N->getValueType(0); 14886 if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || 14887 (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && 14888 isHorizontalBinOp(Op0, Op1, true)) 14889 return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); 14890 14891 return OptimizeConditionalInDecrement(N, DAG); 14892} 14893 14894SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 14895 DAGCombinerInfo &DCI) const { 14896 SelectionDAG &DAG = DCI.DAG; 14897 switch (N->getOpcode()) { 14898 default: break; 14899 case ISD::EXTRACT_VECTOR_ELT: 14900 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 14901 case ISD::VSELECT: 14902 case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); 14903 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 14904 case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); 14905 case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); 14906 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); 14907 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 14908 case ISD::SHL: 14909 case ISD::SRA: 14910 case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget); 14911 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); 14912 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 14913 case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); 14914 case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget); 14915 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 14916 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); 14917 case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); 14918 case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); 14919 case X86ISD::FXOR: 14920 case 
X86ISD::FOR: return PerformFORCombine(N, DAG); 14921 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 14922 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 14923 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 14924 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget); 14925 case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); 14926 case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); 14927 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); 14928 case X86ISD::SHUFP: // Handle all target specific shuffles 14929 case X86ISD::PALIGN: 14930 case X86ISD::UNPCKH: 14931 case X86ISD::UNPCKL: 14932 case X86ISD::MOVHLPS: 14933 case X86ISD::MOVLHPS: 14934 case X86ISD::PSHUFD: 14935 case X86ISD::PSHUFHW: 14936 case X86ISD::PSHUFLW: 14937 case X86ISD::MOVSS: 14938 case X86ISD::MOVSD: 14939 case X86ISD::VPERMILP: 14940 case X86ISD::VPERM2X128: 14941 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); 14942 } 14943 14944 return SDValue(); 14945} 14946 14947/// isTypeDesirableForOp - Return true if the target has native support for 14948/// the specified value type and it is 'desirable' to use the type for the 14949/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 14950/// instruction encodings are longer and some i16 instructions are slow. 14951bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 14952 if (!isTypeLegal(VT)) 14953 return false; 14954 if (VT != MVT::i16) 14955 return true; 14956 14957 switch (Opc) { 14958 default: 14959 return true; 14960 case ISD::LOAD: 14961 case ISD::SIGN_EXTEND: 14962 case ISD::ZERO_EXTEND: 14963 case ISD::ANY_EXTEND: 14964 case ISD::SHL: 14965 case ISD::SRL: 14966 case ISD::SUB: 14967 case ISD::ADD: 14968 case ISD::MUL: 14969 case ISD::AND: 14970 case ISD::OR: 14971 case ISD::XOR: 14972 return false; 14973 } 14974} 14975 14976/// IsDesirableToPromoteOp - This method query the target whether it is 14977/// beneficial for dag combiner to promote the specified node. If true, it 14978/// should return the desired promotion type by reference. 14979bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 14980 EVT VT = Op.getValueType(); 14981 if (VT != MVT::i16) 14982 return false; 14983 14984 bool Promote = false; 14985 bool Commute = false; 14986 switch (Op.getOpcode()) { 14987 default: break; 14988 case ISD::LOAD: { 14989 LoadSDNode *LD = cast<LoadSDNode>(Op); 14990 // If the non-extending load has a single use and it's not live out, then it 14991 // might be folded. 14992 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 14993 Op.hasOneUse()*/) { 14994 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 14995 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 14996 // The only case where we'd want to promote LOAD (rather then it being 14997 // promoted as an operand is when it's only use is liveout. 14998 if (UI->getOpcode() != ISD::CopyToReg) 14999 return false; 15000 } 15001 } 15002 Promote = true; 15003 break; 15004 } 15005 case ISD::SIGN_EXTEND: 15006 case ISD::ZERO_EXTEND: 15007 case ISD::ANY_EXTEND: 15008 Promote = true; 15009 break; 15010 case ISD::SHL: 15011 case ISD::SRL: { 15012 SDValue N0 = Op.getOperand(0); 15013 // Look out for (store (shl (load), x)). 
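    // Promoting that pattern to i32 could block folding the load and the store
    // into a single i16 read-modify-write shift, so decline the promotion.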
15014 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 15015 return false; 15016 Promote = true; 15017 break; 15018 } 15019 case ISD::ADD: 15020 case ISD::MUL: 15021 case ISD::AND: 15022 case ISD::OR: 15023 case ISD::XOR: 15024 Commute = true; 15025 // fallthrough 15026 case ISD::SUB: { 15027 SDValue N0 = Op.getOperand(0); 15028 SDValue N1 = Op.getOperand(1); 15029 if (!Commute && MayFoldLoad(N1)) 15030 return false; 15031 // Avoid disabling potential load folding opportunities. 15032 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 15033 return false; 15034 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 15035 return false; 15036 Promote = true; 15037 } 15038 } 15039 15040 PVT = MVT::i32; 15041 return Promote; 15042} 15043 15044//===----------------------------------------------------------------------===// 15045// X86 Inline Assembly Support 15046//===----------------------------------------------------------------------===// 15047 15048namespace { 15049 // Helper to match a string separated by whitespace. 15050 bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { 15051 s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. 15052 15053 for (unsigned i = 0, e = args.size(); i != e; ++i) { 15054 StringRef piece(*args[i]); 15055 if (!s.startswith(piece)) // Check if the piece matches. 15056 return false; 15057 15058 s = s.substr(piece.size()); 15059 StringRef::size_type pos = s.find_first_not_of(" \t"); 15060 if (pos == 0) // We matched a prefix. 15061 return false; 15062 15063 s = s.substr(pos); 15064 } 15065 15066 return s.empty(); 15067 } 15068 const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; 15069} 15070 15071bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 15072 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 15073 15074 std::string AsmStr = IA->getAsmString(); 15075 15076 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 15077 if (!Ty || Ty->getBitWidth() % 16 != 0) 15078 return false; 15079 15080 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 15081 SmallVector<StringRef, 4> AsmPieces; 15082 SplitString(AsmStr, AsmPieces, ";\n"); 15083 15084 switch (AsmPieces.size()) { 15085 default: return false; 15086 case 1: 15087 // FIXME: this should verify that we are targeting a 486 or better. If not, 15088 // we will turn this bswap into something that will be lowered to logical 15089 // ops instead of emitting the bswap asm. For now, we don't support 486 or 15090 // lower so don't worry about this. 15091 // bswap $0 15092 if (matchAsm(AsmPieces[0], "bswap", "$0") || 15093 matchAsm(AsmPieces[0], "bswapl", "$0") || 15094 matchAsm(AsmPieces[0], "bswapq", "$0") || 15095 matchAsm(AsmPieces[0], "bswap", "${0:q}") || 15096 matchAsm(AsmPieces[0], "bswapl", "${0:q}") || 15097 matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { 15098 // No need to check constraints, nothing other than the equivalent of 15099 // "=r,0" would be valid here. 
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      std::sort(AsmPieces.begin(), AsmPieces.end());
      if (AsmPieces.size() == 4 &&
          AsmPieces[0] == "~{cc}" &&
          AsmPieces[1] == "~{dirflag}" &&
          AsmPieces[2] == "~{flags}" &&
          AsmPieces[3] == "~{fpsr}")
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      std::sort(AsmPieces.begin(), AsmPieces.end());
      if (AsmPieces.size() == 4 &&
          AsmPieces[0] == "~{cc}" &&
          AsmPieces[1] == "~{dirflag}" &&
          AsmPieces[2] == "~{flags}" &&
          AsmPieces[3] == "~{fpsr}")
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
            matchAsm(AsmPieces[1], "bswap", "%edx") &&
            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'Y':
    case 'l':
      return C_RegisterClass;
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
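/// For example, an i32 ConstantInt operand of value 3 matched against the 'I'
/// constraint (an immediate in [0, 31]) is weighted CW_Constant below, while a
/// 128-bit vector operand matched against 'x' with SSE1 available is weighted
/// CW_Register.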
TargetLowering::ConstraintWeight
  X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (CallOperandVal == NULL)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget->hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'x':
  case 'Y':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX()))
      weight = CW_Register;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (dyn_cast<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
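  // e.g. a float operand constrained with "X" is rewritten to "Y" when SSE2 is
  // available and to "x" when only SSE1 is, keeping the value out of the x87
  // stack whenever an SSE register can hold it.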
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in certain
      // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup.  These can't
    // be used as immediates.
    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
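    // e.g. for an operand like (add (add GA, 4), 8) the loop below peels off
    // the constant additions, accumulating Offset = 12, and stops once it
    // reaches the GlobalAddress node itself.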
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
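      // In 64-bit mode the REX prefix makes the low byte of every GR register
      // addressable, so 'q' can use the full GR8/16/32/64 classes below; in
      // 32-bit mode it falls through to the a/b/c/d-only classes used for 'Q'.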
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, X86::GR32RegisterClass);
        else if (VT == MVT::i16)
          return std::make_pair(0U, X86::GR16RegisterClass);
        else if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, X86::GR8RegisterClass);
        else if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, X86::GR64RegisterClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, X86::GR32_ABCDRegisterClass);
      else if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_ABCDRegisterClass);
      else if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass);
      else if (VT == MVT::i64)
        return std::make_pair(0U, X86::GR64_ABCDRegisterClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        return std::make_pair(0U, X86::VR256RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
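  // This resolves explicit register names such as "{eax}" or "{xmm0}"; the
  // fix-ups below then adjust cases where that generic lookup picked a
  // register class whose value type does not match VT.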
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) .. st(7) to ST0 .. ST7.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
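  // e.g. "{ax}" with an i8 operand is remapped below to AL in GR8, and with an
  // i64 operand to RAX in GR64.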
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}