X86ISelLowering.cpp revision 7782102c70fdfd48776f05099eb67dd268cfc222
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl);

static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl);

/// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference.  Idx is an index in the 128 bits we
/// want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec,
                                   SDValue Idx,
                                   SelectionDAG &DAG,
                                   DebugLoc dl) {
  EVT VT = Vec.getValueType();
  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
  EVT ElVT = VT.getVectorElementType();
  int Factor = VT.getSizeInBits()/128;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getNode(ISD::UNDEF, dl, ResultVT);

  if (isa<ConstantSDNode>(Idx)) {
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR that
    // we can match to VEXTRACTF128.
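    // Worked example (illustrative only, not part of the original comment):
    // for a v8i32 source with IdxVal == 5, ElemsPerChunk below is 128/32 == 4
    // and NormalizedIdxVal is ((5 * 32) / 128) * 4 == 4, so the
    // EXTRACT_SUBVECTOR grabs the v4i32 holding elements [4, 8), i.e. the
    // upper 128-bit half.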
    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                 VecIdx);

    return Result;
  }

  return SDValue();
}

/// Generate a DAG to put 128 bits into a vector > 128 bits.  This
/// sets things up to match to an AVX VINSERTF128 instruction or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result,
                                  SDValue Vec,
                                  SDValue Idx,
                                  SelectionDAG &DAG,
                                  DebugLoc dl) {
  if (isa<ConstantSDNode>(Idx)) {
    EVT VT = Vec.getValueType();
    assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");

    EVT ElVT = VT.getVectorElementType();
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    EVT ResultVT = Result.getValueType();

    // Insert the relevant 128 bits.
    unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();

    // This is the index of the first element of the 128-bit chunk
    // we want.
    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                                 * ElemsPerChunk);

    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
    Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                         VecIdx);
    return Result;
  }

  return SDValue();
}

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  bool is64Bit = Subtarget->is64Bit();

  if (Subtarget->isTargetEnvMacho()) {
    if (is64Bit)
      return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasXMMInt();
  X86ScalarSSEf32 = Subtarget->hasXMM();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.
  static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird; it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger.  It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler; for
  // 32-bit code, use the register-pressure-specific scheduling.
  if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Set up the Windows compiler runtime calls.
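    // (Summary, stated here for clarity rather than taken from the original
    // comment: the default compiler-rt style libcall names, e.g. __divdi3 for
    // a signed 64-bit divide, are replaced below with the MSVC CRT helpers
    // _alldiv/_aulldiv/_allrem/_aullrem/_allmul, and fp-to-unsigned-i64
    // conversions are routed through _ftol2.)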
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2");
    setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C);
    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8 , Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
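  // (Illustrative reading, not in the original: "Promote" here means the
  // narrow integer operand is first sign-extended to a wider legal type,
  // typically i32, and the conversion then uses that type's normal lowering,
  // e.g. cvtsi2ss/cvtsi2sd with SSE or fild with x87.)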
  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);

  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Custom);
      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i64  , Custom);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i64  , Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86.
  // Using the two-result multiply instructions when both high and low results
  // are needed must be arranged by dagcombine.
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
  }

  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
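    // (Assumed rationale, not in the original comment: lzcnt produces a
    // well-defined result, the operand width, for a zero input, so the
    // *_ZERO_UNDEF variants for i16/i32/i64 below can simply be expanded
    // back to the plain CTLZ node.)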
    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
  } else {
    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT           , MVT::i8   , Custom);
  setOperationAction(ISD::SELECT           , MVT::i16  , Custom);
  setOperationAction(ISD::SELECT           , MVT::i32  , Custom);
  setOperationAction(ISD::SELECT           , MVT::f32  , Custom);
  setOperationAction(ISD::SELECT           , MVT::f64  , Custom);
  setOperationAction(ISD::SELECT           , MVT::f80  , Custom);
  setOperationAction(ISD::SETCC            , MVT::i8   , Custom);
  setOperationAction(ISD::SETCC            , MVT::i16  , Custom);
  setOperationAction(ISD::SETCC            , MVT::i32  , Custom);
  setOperationAction(ISD::SETCC            , MVT::f32  , Custom);
  setOperationAction(ISD::SETCC            , MVT::f64  , Custom);
  setOperationAction(ISD::SETCC            , MVT::f80  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT         , MVT::i64  , Custom);
    setOperationAction(ISD::SETCC          , MVT::i64  , Custom);
  }
  setOperationAction(ISD::EH_RETURN        , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool     , MVT::i32  , Custom);
  setOperationAction(ISD::JumpTable        , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalAddress    , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalTLSAddress , MVT::i32  , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol   , MVT::i32  , Custom);
  setOperationAction(ISD::BlockAddress     , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool   , MVT::i64  , Custom);
    setOperationAction(ISD::JumpTable      , MVT::i64  , Custom);
    setOperationAction(ISD::GlobalAddress  , MVT::i64  , Custom);
    setOperationAction(ISD::ExternalSymbol , MVT::i64  , Custom);
    setOperationAction(ISD::BlockAddress   , MVT::i64  , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS        , MVT::i32  , Custom);
  setOperationAction(ISD::SRA_PARTS        , MVT::i32  , Custom);
  setOperationAction(ISD::SRL_PARTS        , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS      , MVT::i64  , Custom);
    setOperationAction(ISD::SRA_PARTS      , MVT::i64  , Custom);
    setOperationAction(ISD::SRL_PARTS      , MVT::i64  , Custom);
  }

  if (Subtarget->hasXMM())
    setOperationAction(ISD::PREFETCH       , MVT::Other, Legal);

  setOperationAction(ISD::MEMBARRIER       , MVT::Other, Custom);
  setOperationAction(ISD::ATOMIC_FENCE     , MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
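  // For example (an illustrative IR sequence, not from the original comment):
  //   fence seq_cst
  //   %old = atomicrmw add i32* %p, i32 1 seq_cst
  //   fence seq_cst
  // can drop both fences, since the locked instruction the atomicrmw lowers
  // to already acts as a full barrier on x86.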
  setShouldFoldAtomicFences(true);

  // Expand certain atomics
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART          , MVT::Other, Custom);
  setOperationAction(ISD::VAEND            , MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG          , MVT::Other, Custom);
    setOperationAction(ISD::VACOPY         , MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG          , MVT::Other, Expand);
    setOperationAction(ISD::VACOPY         , MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE,         MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE,      MVT::Other, Expand);

  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else if (TM.Options.EnableSegmentedStacks)
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                       MVT::i64 : MVT::i32, Expand);

  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
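    // Rough sketch of the trick (assumed detail, not in the original comment):
    // the custom lowering flips only the IEEE sign bit with a bitwise xor
    // against a constant-pool mask, e.g. for f64 something like
    //   xorpd  .LCPI_signmask(%rip), %xmm0   ; mask = 0x8000000000000000
    // while FABS above clears the sign bit with andpd and the inverted mask.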
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87.
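  // (Clarifying assumption, not spelled out in the original: "long double"
  // here is the 80-bit x87 extended-precision type, MVT::f80.  There is no
  // SSE form of it, so it always lives in the RFP80 register class added
  // below, regardless of the SSE level.)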
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f80 , Expand);
      setOperationAction(ISD::FCOS , MVT::f80 , Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
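  // The overall pattern, summarized here for clarity: every operation on
  // every vector type is first marked Expand by the loop below, and the
  // feature-gated blocks that follow (SSE1, SSE2, SSE4.1, AVX, AVX2) then
  // flip individual (opcode, VT) pairs back, e.g.
  //   setOperationAction(ISD::FADD, MVT::v4f32, Legal);   // in the SSE1 block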
711 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 712 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 713 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 714 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 715 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 716 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 717 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 718 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 719 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 720 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 721 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 722 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 723 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 724 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 725 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 726 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 727 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 728 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 729 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 730 setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 731 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 732 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 733 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 734 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 735 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 736 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 737 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 738 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 739 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 740 setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 741 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 742 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 743 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 744 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 745 setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); 746 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 747 setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); 748 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 749 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 750 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 751 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 752 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 753 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 754 setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand); 755 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 756 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 757 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 758 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 759 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 760 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 761 
setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 762 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 763 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 764 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 765 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 766 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 767 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 768 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 769 setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand); 770 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 771 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 772 setTruncStoreAction((MVT::SimpleValueType)VT, 773 (MVT::SimpleValueType)InnerVT, Expand); 774 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 775 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 776 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 777 } 778 779 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 780 // with -msoft-float, disable use of MMX as well. 781 if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { 782 addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); 783 // No operations on x86mmx supported, everything uses intrinsics. 784 } 785 786 // MMX-sized vectors (other than x86mmx) are expected to be expanded 787 // into smaller operations. 788 setOperationAction(ISD::MULHS, MVT::v8i8, Expand); 789 setOperationAction(ISD::MULHS, MVT::v4i16, Expand); 790 setOperationAction(ISD::MULHS, MVT::v2i32, Expand); 791 setOperationAction(ISD::MULHS, MVT::v1i64, Expand); 792 setOperationAction(ISD::AND, MVT::v8i8, Expand); 793 setOperationAction(ISD::AND, MVT::v4i16, Expand); 794 setOperationAction(ISD::AND, MVT::v2i32, Expand); 795 setOperationAction(ISD::AND, MVT::v1i64, Expand); 796 setOperationAction(ISD::OR, MVT::v8i8, Expand); 797 setOperationAction(ISD::OR, MVT::v4i16, Expand); 798 setOperationAction(ISD::OR, MVT::v2i32, Expand); 799 setOperationAction(ISD::OR, MVT::v1i64, Expand); 800 setOperationAction(ISD::XOR, MVT::v8i8, Expand); 801 setOperationAction(ISD::XOR, MVT::v4i16, Expand); 802 setOperationAction(ISD::XOR, MVT::v2i32, Expand); 803 setOperationAction(ISD::XOR, MVT::v1i64, Expand); 804 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); 805 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); 806 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); 807 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); 808 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); 809 setOperationAction(ISD::SELECT, MVT::v8i8, Expand); 810 setOperationAction(ISD::SELECT, MVT::v4i16, Expand); 811 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 812 setOperationAction(ISD::SELECT, MVT::v1i64, Expand); 813 setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); 814 setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); 815 setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); 816 setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); 817 818 if (!TM.Options.UseSoftFloat && Subtarget->hasXMM()) { 819 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 820 821 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 822 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 823 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 824 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 
825 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 826 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 827 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 828 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 829 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 830 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 831 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 832 setOperationAction(ISD::SETCC, MVT::v4f32, Custom); 833 } 834 835 if (!TM.Options.UseSoftFloat && Subtarget->hasXMMInt()) { 836 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 837 838 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 839 // registers cannot be used even for integer operations. 840 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 841 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 842 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 843 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 844 845 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 846 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 847 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 848 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 849 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 850 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 851 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 852 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 853 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 854 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 855 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 856 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 857 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 858 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 859 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 860 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 861 862 setOperationAction(ISD::SETCC, MVT::v2i64, Custom); 863 setOperationAction(ISD::SETCC, MVT::v16i8, Custom); 864 setOperationAction(ISD::SETCC, MVT::v8i16, Custom); 865 setOperationAction(ISD::SETCC, MVT::v4i32, Custom); 866 867 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 868 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 869 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 870 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 871 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 872 873 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 874 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 875 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 876 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 877 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 878 879 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 
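    // (Summary of the loop below, added for clarity: it walks the MVT vector
    // enum from v16i8 up to, but not including, v2i64 and custom-lowers these
    // three operations only for the power-of-two-sized 128-bit types it finds;
    // v2i64 and v2f64 are handled explicitly right after the loop.)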
880 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 881 EVT VT = (MVT::SimpleValueType)i; 882 // Do not attempt to custom lower non-power-of-2 vectors 883 if (!isPowerOf2_32(VT.getVectorNumElements())) 884 continue; 885 // Do not attempt to custom lower non-128-bit vectors 886 if (!VT.is128BitVector()) 887 continue; 888 setOperationAction(ISD::BUILD_VECTOR, 889 VT.getSimpleVT().SimpleTy, Custom); 890 setOperationAction(ISD::VECTOR_SHUFFLE, 891 VT.getSimpleVT().SimpleTy, Custom); 892 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 893 VT.getSimpleVT().SimpleTy, Custom); 894 } 895 896 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 897 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 898 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 899 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 900 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 901 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 902 903 if (Subtarget->is64Bit()) { 904 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 905 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 906 } 907 908 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 909 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { 910 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 911 EVT VT = SVT; 912 913 // Do not attempt to promote non-128-bit vectors 914 if (!VT.is128BitVector()) 915 continue; 916 917 setOperationAction(ISD::AND, SVT, Promote); 918 AddPromotedToType (ISD::AND, SVT, MVT::v2i64); 919 setOperationAction(ISD::OR, SVT, Promote); 920 AddPromotedToType (ISD::OR, SVT, MVT::v2i64); 921 setOperationAction(ISD::XOR, SVT, Promote); 922 AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); 923 setOperationAction(ISD::LOAD, SVT, Promote); 924 AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); 925 setOperationAction(ISD::SELECT, SVT, Promote); 926 AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); 927 } 928 929 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 930 931 // Custom lower v2i64 and v2f64 selects. 932 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 933 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 934 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 935 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 936 937 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 938 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 939 } 940 941 if (Subtarget->hasSSE41orAVX()) { 942 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 943 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 944 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 945 setOperationAction(ISD::FRINT, MVT::f32, Legal); 946 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 947 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 948 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 949 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 950 setOperationAction(ISD::FRINT, MVT::f64, Legal); 951 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 952 953 // FIXME: Do we need to handle scalar-to-vector here? 
954 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 955 956 setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); 957 setOperationAction(ISD::VSELECT, MVT::v2i64, Legal); 958 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); 959 setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); 960 setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); 961 962 // i8 and i16 vectors are custom , because the source register and source 963 // source memory operand types are not the same width. f32 vectors are 964 // custom since the immediate controlling the insert encodes additional 965 // information. 966 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 967 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 968 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 969 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 970 971 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 972 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 973 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 974 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 975 976 // FIXME: these should be Legal but thats only for the case where 977 // the index is constant. For now custom expand to deal with that. 978 if (Subtarget->is64Bit()) { 979 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 980 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 981 } 982 } 983 984 if (Subtarget->hasXMMInt()) { 985 setOperationAction(ISD::SRL, MVT::v8i16, Custom); 986 setOperationAction(ISD::SRL, MVT::v16i8, Custom); 987 988 setOperationAction(ISD::SHL, MVT::v8i16, Custom); 989 setOperationAction(ISD::SHL, MVT::v16i8, Custom); 990 991 setOperationAction(ISD::SRA, MVT::v8i16, Custom); 992 setOperationAction(ISD::SRA, MVT::v16i8, Custom); 993 994 if (Subtarget->hasAVX2()) { 995 setOperationAction(ISD::SRL, MVT::v2i64, Legal); 996 setOperationAction(ISD::SRL, MVT::v4i32, Legal); 997 998 setOperationAction(ISD::SHL, MVT::v2i64, Legal); 999 setOperationAction(ISD::SHL, MVT::v4i32, Legal); 1000 1001 setOperationAction(ISD::SRA, MVT::v4i32, Legal); 1002 } else { 1003 setOperationAction(ISD::SRL, MVT::v2i64, Custom); 1004 setOperationAction(ISD::SRL, MVT::v4i32, Custom); 1005 1006 setOperationAction(ISD::SHL, MVT::v2i64, Custom); 1007 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 1008 1009 setOperationAction(ISD::SRA, MVT::v4i32, Custom); 1010 } 1011 } 1012 1013 if (Subtarget->hasSSE42orAVX()) 1014 setOperationAction(ISD::SETCC, MVT::v2i64, Custom); 1015 1016 if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { 1017 addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); 1018 addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); 1019 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 1020 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 1021 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 1022 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 1023 1024 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 1025 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 1026 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 1027 1028 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 1029 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 1030 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 1031 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 1032 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 1033 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 1034 1035 setOperationAction(ISD::FADD, MVT::v4f64, 
Legal); 1036 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 1037 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 1038 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 1039 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 1040 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 1041 1042 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); 1043 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); 1044 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); 1045 1046 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom); 1047 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom); 1048 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); 1049 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); 1050 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom); 1051 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom); 1052 1053 setOperationAction(ISD::SRL, MVT::v16i16, Custom); 1054 setOperationAction(ISD::SRL, MVT::v32i8, Custom); 1055 1056 setOperationAction(ISD::SHL, MVT::v16i16, Custom); 1057 setOperationAction(ISD::SHL, MVT::v32i8, Custom); 1058 1059 setOperationAction(ISD::SRA, MVT::v16i16, Custom); 1060 setOperationAction(ISD::SRA, MVT::v32i8, Custom); 1061 1062 setOperationAction(ISD::SETCC, MVT::v32i8, Custom); 1063 setOperationAction(ISD::SETCC, MVT::v16i16, Custom); 1064 setOperationAction(ISD::SETCC, MVT::v8i32, Custom); 1065 setOperationAction(ISD::SETCC, MVT::v4i64, Custom); 1066 1067 setOperationAction(ISD::SELECT, MVT::v4f64, Custom); 1068 setOperationAction(ISD::SELECT, MVT::v4i64, Custom); 1069 setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 1070 1071 setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); 1072 setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); 1073 setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); 1074 setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); 1075 1076 if (Subtarget->hasAVX2()) { 1077 setOperationAction(ISD::ADD, MVT::v4i64, Legal); 1078 setOperationAction(ISD::ADD, MVT::v8i32, Legal); 1079 setOperationAction(ISD::ADD, MVT::v16i16, Legal); 1080 setOperationAction(ISD::ADD, MVT::v32i8, Legal); 1081 1082 setOperationAction(ISD::SUB, MVT::v4i64, Legal); 1083 setOperationAction(ISD::SUB, MVT::v8i32, Legal); 1084 setOperationAction(ISD::SUB, MVT::v16i16, Legal); 1085 setOperationAction(ISD::SUB, MVT::v32i8, Legal); 1086 1087 setOperationAction(ISD::MUL, MVT::v4i64, Custom); 1088 setOperationAction(ISD::MUL, MVT::v8i32, Legal); 1089 setOperationAction(ISD::MUL, MVT::v16i16, Legal); 1090 // Don't lower v32i8 because there is no 128-bit byte mul 1091 1092 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); 1093 1094 setOperationAction(ISD::SRL, MVT::v4i64, Legal); 1095 setOperationAction(ISD::SRL, MVT::v8i32, Legal); 1096 1097 setOperationAction(ISD::SHL, MVT::v4i64, Legal); 1098 setOperationAction(ISD::SHL, MVT::v8i32, Legal); 1099 1100 setOperationAction(ISD::SRA, MVT::v8i32, Legal); 1101 } else { 1102 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 1103 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 1104 setOperationAction(ISD::ADD, MVT::v16i16, Custom); 1105 setOperationAction(ISD::ADD, MVT::v32i8, Custom); 1106 1107 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 1108 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 1109 setOperationAction(ISD::SUB, MVT::v16i16, Custom); 1110 setOperationAction(ISD::SUB, MVT::v32i8, Custom); 1111 1112 setOperationAction(ISD::MUL, MVT::v4i64, Custom); 1113 setOperationAction(ISD::MUL, MVT::v8i32, Custom); 1114 setOperationAction(ISD::MUL, MVT::v16i16, 
Custom); 1115 // Don't lower v32i8 because there is no 128-bit byte mul 1116 1117 setOperationAction(ISD::SRL, MVT::v4i64, Custom); 1118 setOperationAction(ISD::SRL, MVT::v8i32, Custom); 1119 1120 setOperationAction(ISD::SHL, MVT::v4i64, Custom); 1121 setOperationAction(ISD::SHL, MVT::v8i32, Custom); 1122 1123 setOperationAction(ISD::SRA, MVT::v8i32, Custom); 1124 } 1125 1126 // Custom lower several nodes for 256-bit types. 1127 for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 1128 i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { 1129 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 1130 EVT VT = SVT; 1131 1132 // Extract subvector is special because the value type 1133 // (result) is 128-bit but the source is 256-bit wide. 1134 if (VT.is128BitVector()) 1135 setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom); 1136 1137 // Do not attempt to custom lower other non-256-bit vectors 1138 if (!VT.is256BitVector()) 1139 continue; 1140 1141 setOperationAction(ISD::BUILD_VECTOR, SVT, Custom); 1142 setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom); 1143 setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom); 1144 setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom); 1145 setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom); 1146 setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom); 1147 } 1148 1149 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 1150 for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) { 1151 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 1152 EVT VT = SVT; 1153 1154 // Do not attempt to promote non-256-bit vectors 1155 if (!VT.is256BitVector()) 1156 continue; 1157 1158 setOperationAction(ISD::AND, SVT, Promote); 1159 AddPromotedToType (ISD::AND, SVT, MVT::v4i64); 1160 setOperationAction(ISD::OR, SVT, Promote); 1161 AddPromotedToType (ISD::OR, SVT, MVT::v4i64); 1162 setOperationAction(ISD::XOR, SVT, Promote); 1163 AddPromotedToType (ISD::XOR, SVT, MVT::v4i64); 1164 setOperationAction(ISD::LOAD, SVT, Promote); 1165 AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64); 1166 setOperationAction(ISD::SELECT, SVT, Promote); 1167 AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64); 1168 } 1169 } 1170 1171 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion 1172 // of this type with custom code. 1173 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 1174 VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) { 1175 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, 1176 Custom); 1177 } 1178 1179 // We want to custom lower some of our intrinsics. 1180 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1181 1182 1183 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 1184 // handle type legalization for these operations here. 1185 // 1186 // FIXME: We really should do custom legalization for addition and 1187 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 1188 // than generic legalization for 64-bit multiplication-with-overflow, though. 1189 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { 1190 // Add/Sub/Mul with overflow operations are custom lowered. 
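    // (Clarifying note: e = 3 + is64Bit() walks IntVTs i8, i16 and i32, and
    // additionally i64 only on 64-bit targets, matching the comment above
    // about 64-bit SADDO and friends.)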
    MVT VT = IntVTs[i];
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);
  }

  // There are no 8-bit 3-address imul/mul instructions
  setOperationAction(ISD::SMULO, MVT::i8, Expand);
  setOperationAction(ISD::UMULO, MVT::i8, Expand);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::VSELECT);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);
  if (Subtarget->hasBMI())
    setTargetDAGCombine(ISD::XOR);

  computeRegisterProperties();

  // On Darwin, -Os means optimize for size without hurting performance,
  // do not reduce the limit.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(4); // 2^4 bytes.
  benefitFromCodePlacementOpt = true;

  setPrefFunctionAlignment(4); // 2^4 bytes.
}


EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
  if (!VT.isVector()) return MVT::i8;
  return VT.changeVectorElementTypeToInteger();
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
/// For X86, aggregates that contain SSE vectors are placed at 16-byte
/// boundaries while the rest are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasXMM())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering.  If DstAlign is zero, the destination alignment can satisfy any
/// constraint.  Similarly, if SrcAlign is zero there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded.  If 'IsZeroVal' is true, it is safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory.  'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsZeroVal,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux.  This is because the stack realignment code can't handle certain
  // cases like PR2962.  This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (IsZeroVal &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasAVX() &&
          Subtarget->getStackAlignment() >= 32)
        return MVT::v8f32;
      if (Subtarget->hasXMMInt())
        return MVT::v4i32;
      if (Subtarget->hasXMM())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasXMMInt()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function.  The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
1363 return TargetLowering::getJumpTableEncoding(); 1364} 1365 1366const MCExpr * 1367X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1368 const MachineBasicBlock *MBB, 1369 unsigned uid, MCContext &Ctx) const { 1370 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1371 Subtarget->isPICStyleGOT()); 1372 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1373 // entries. 1374 return MCSymbolRefExpr::Create(MBB->getSymbol(), 1375 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1376} 1377 1378/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC 1379/// jumptable. 1380SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1381 SelectionDAG &DAG) const { 1382 if (!Subtarget->is64Bit()) 1383 // This doesn't have DebugLoc associated with it, but is not really the 1384 // same as a Register. 1385 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1386 return Table; 1387} 1388 1389/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1390/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1391/// MCExpr. 1392const MCExpr *X86TargetLowering:: 1393getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1394 MCContext &Ctx) const { 1395 // X86-64 uses RIP relative addressing based on the jump table label. 1396 if (Subtarget->isPICStyleRIPRel()) 1397 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1398 1399 // Otherwise, the reference is relative to the PIC base. 1400 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); 1401} 1402 1403// FIXME: Why is this routine here? Move to RegInfo! 1404std::pair<const TargetRegisterClass*, uint8_t> 1405X86TargetLowering::findRepresentativeClass(EVT VT) const { 1406 const TargetRegisterClass *RRC = 0; 1407 uint8_t Cost = 1; 1408 switch (VT.getSimpleVT().SimpleTy) { 1409 default: 1410 return TargetLowering::findRepresentativeClass(VT); 1411 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1412 RRC = (Subtarget->is64Bit() 1413 ? 
X86::GR64RegisterClass : X86::GR32RegisterClass); 1414 break; 1415 case MVT::x86mmx: 1416 RRC = X86::VR64RegisterClass; 1417 break; 1418 case MVT::f32: case MVT::f64: 1419 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1420 case MVT::v4f32: case MVT::v2f64: 1421 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1422 case MVT::v4f64: 1423 RRC = X86::VR128RegisterClass; 1424 break; 1425 } 1426 return std::make_pair(RRC, Cost); 1427} 1428 1429bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1430 unsigned &Offset) const { 1431 if (!Subtarget->isTargetLinux()) 1432 return false; 1433 1434 if (Subtarget->is64Bit()) { 1435 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1436 Offset = 0x28; 1437 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1438 AddressSpace = 256; 1439 else 1440 AddressSpace = 257; 1441 } else { 1442 // %gs:0x14 on i386 1443 Offset = 0x14; 1444 AddressSpace = 256; 1445 } 1446 return true; 1447} 1448 1449 1450//===----------------------------------------------------------------------===// 1451// Return Value Calling Convention Implementation 1452//===----------------------------------------------------------------------===// 1453 1454#include "X86GenCallingConv.inc" 1455 1456bool 1457X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1458 MachineFunction &MF, bool isVarArg, 1459 const SmallVectorImpl<ISD::OutputArg> &Outs, 1460 LLVMContext &Context) const { 1461 SmallVector<CCValAssign, 16> RVLocs; 1462 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1463 RVLocs, Context); 1464 return CCInfo.CheckReturn(Outs, RetCC_X86); 1465} 1466 1467SDValue 1468X86TargetLowering::LowerReturn(SDValue Chain, 1469 CallingConv::ID CallConv, bool isVarArg, 1470 const SmallVectorImpl<ISD::OutputArg> &Outs, 1471 const SmallVectorImpl<SDValue> &OutVals, 1472 DebugLoc dl, SelectionDAG &DAG) const { 1473 MachineFunction &MF = DAG.getMachineFunction(); 1474 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1475 1476 SmallVector<CCValAssign, 16> RVLocs; 1477 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1478 RVLocs, *DAG.getContext()); 1479 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1480 1481 // Add the regs to the liveout set for the function. 1482 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1483 for (unsigned i = 0; i != RVLocs.size(); ++i) 1484 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1485 MRI.addLiveOut(RVLocs[i].getLocReg()); 1486 1487 SDValue Flag; 1488 1489 SmallVector<SDValue, 6> RetOps; 1490 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1491 // Operand #1 = Bytes To Pop 1492 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1493 MVT::i16)); 1494 1495 // Copy the result values into the output registers. 1496 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1497 CCValAssign &VA = RVLocs[i]; 1498 assert(VA.isRegLoc() && "Can only return in registers!"); 1499 SDValue ValToCopy = OutVals[i]; 1500 EVT ValVT = ValToCopy.getValueType(); 1501 1502 // If this is x86-64, and we disabled SSE, we can't return FP values, 1503 // or SSE or MMX vectors. 1504 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1505 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1506 (Subtarget->is64Bit() && !Subtarget->hasXMM())) { 1507 report_fatal_error("SSE register return with SSE disabled"); 1508 } 1509 // Likewise we can't return F64 values with SSE1 only. 
gcc does so, but 1510 // llvm-gcc has never done it right and no one has noticed, so this 1511 // should be OK for now. 1512 if (ValVT == MVT::f64 && 1513 (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) 1514 report_fatal_error("SSE2 register return with SSE2 disabled"); 1515 1516 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1517 // the RET instruction and handled by the FP Stackifier. 1518 if (VA.getLocReg() == X86::ST0 || 1519 VA.getLocReg() == X86::ST1) { 1520 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1521 // change the value to the FP stack register class. 1522 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1523 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1524 RetOps.push_back(ValToCopy); 1525 // Don't emit a copytoreg. 1526 continue; 1527 } 1528 1529 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1530 // which is returned in RAX / RDX. 1531 if (Subtarget->is64Bit()) { 1532 if (ValVT == MVT::x86mmx) { 1533 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1534 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1535 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1536 ValToCopy); 1537 // If we don't have SSE2 available, convert to v4f32 so the generated 1538 // register is legal. 1539 if (!Subtarget->hasXMMInt()) 1540 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1541 } 1542 } 1543 } 1544 1545 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1546 Flag = Chain.getValue(1); 1547 } 1548 1549 // The x86-64 ABI for returning structs by value requires that we copy 1550 // the sret argument into %rax for the return. We saved the argument into 1551 // a virtual register in the entry block, so now we copy the value out 1552 // and into %rax. 1553 if (Subtarget->is64Bit() && 1554 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1555 MachineFunction &MF = DAG.getMachineFunction(); 1556 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1557 unsigned Reg = FuncInfo->getSRetReturnReg(); 1558 assert(Reg && 1559 "SRetReturnReg should have been set in LowerFormalArguments()."); 1560 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1561 1562 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1563 Flag = Chain.getValue(1); 1564 1565 // RAX now acts like a return value. 1566 MRI.addLiveOut(X86::RAX); 1567 } 1568 1569 RetOps[0] = Chain; // Update chain. 1570 1571 // Add the flag if we have it. 1572 if (Flag.getNode()) 1573 RetOps.push_back(Flag); 1574 1575 return DAG.getNode(X86ISD::RET_FLAG, dl, 1576 MVT::Other, &RetOps[0], RetOps.size()); 1577} 1578 1579bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { 1580 if (N->getNumValues() != 1) 1581 return false; 1582 if (!N->hasNUsesOfValue(1, 0)) 1583 return false; 1584 1585 SDNode *Copy = *N->use_begin(); 1586 if (Copy->getOpcode() != ISD::CopyToReg && 1587 Copy->getOpcode() != ISD::FP_EXTEND) 1588 return false; 1589 1590 bool HasRet = false; 1591 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1592 UI != UE; ++UI) { 1593 if (UI->getOpcode() != X86ISD::RET_FLAG) 1594 return false; 1595 HasRet = true; 1596 } 1597 1598 return HasRet; 1599} 1600 1601EVT 1602X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, 1603 ISD::NodeType ExtendKind) const { 1604 MVT ReturnMVT; 1605 // TODO: Is this also valid on 32-bit? 
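// For instance (illustrative note, not from the original comment): an i1
// return value that the caller will zero-extend is widened only to i8 on
// x86-64, so a bool-returning call produces its result in AL; every other
// small integer return is widened to i32 as usual.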
1606 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1607 ReturnMVT = MVT::i8; 1608 else 1609 ReturnMVT = MVT::i32; 1610 1611 EVT MinVT = getRegisterType(Context, ReturnMVT); 1612 return VT.bitsLT(MinVT) ? MinVT : VT; 1613} 1614 1615/// LowerCallResult - Lower the result values of a call into the 1616/// appropriate copies out of appropriate physical registers. 1617/// 1618SDValue 1619X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1620 CallingConv::ID CallConv, bool isVarArg, 1621 const SmallVectorImpl<ISD::InputArg> &Ins, 1622 DebugLoc dl, SelectionDAG &DAG, 1623 SmallVectorImpl<SDValue> &InVals) const { 1624 1625 // Assign locations to each value returned by this call. 1626 SmallVector<CCValAssign, 16> RVLocs; 1627 bool Is64Bit = Subtarget->is64Bit(); 1628 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1629 getTargetMachine(), RVLocs, *DAG.getContext()); 1630 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1631 1632 // Copy all of the result registers out of their specified physreg. 1633 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1634 CCValAssign &VA = RVLocs[i]; 1635 EVT CopyVT = VA.getValVT(); 1636 1637 // If this is x86-64, and we disabled SSE, we can't return FP values 1638 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1639 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) { 1640 report_fatal_error("SSE register return with SSE disabled"); 1641 } 1642 1643 SDValue Val; 1644 1645 // If this is a call to a function that returns an fp value on the floating 1646 // point stack, we must guarantee the the value is popped from the stack, so 1647 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1648 // if the return value is not used. We use the FpPOP_RETVAL instruction 1649 // instead. 1650 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1651 // If we prefer to use the value in xmm registers, copy it out as f80 and 1652 // use a truncate to move it from fp stack reg to xmm reg. 1653 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1654 SDValue Ops[] = { Chain, InFlag }; 1655 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, 1656 MVT::Other, MVT::Glue, Ops, 2), 1); 1657 Val = Chain.getValue(0); 1658 1659 // Round the f80 to the right size, which also moves it to the appropriate 1660 // xmm register. 1661 if (CopyVT != VA.getValVT()) 1662 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1663 // This truncation won't change the value. 1664 DAG.getIntPtrConstant(1)); 1665 } else { 1666 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1667 CopyVT, InFlag).getValue(1); 1668 Val = Chain.getValue(0); 1669 } 1670 InFlag = Chain.getValue(2); 1671 InVals.push_back(Val); 1672 } 1673 1674 return Chain; 1675} 1676 1677 1678//===----------------------------------------------------------------------===// 1679// C & StdCall & Fast Calling Convention implementation 1680//===----------------------------------------------------------------------===// 1681// StdCall calling convention seems to be standard for many Windows' API 1682// routines and around. It differs from C calling convention just a little: 1683// callee should clean up the stack, not caller. Symbols should be also 1684// decorated in some fancy way :) It doesn't support any vector arguments. 1685// For info on fast calling convention see Fast Calling Convention (tail call) 1686// implementation LowerX86_32FastCCCallTo. 
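// Illustrative note (added for exposition): a struct-return call that
// CallIsStructReturn() below recognizes carries the hidden result pointer as
// its first, sret-marked argument, e.g. in IR:
//
//   %tmp = alloca %struct.S
//   call void @foo(%struct.S* sret %tmp, i32 %x)
//
// On 32-bit targets the callee is expected to pop that hidden pointer, which
// is why LowerFormalArguments() and LowerCall() below account for an extra
// 4 bytes of callee-popped stack when the convention is not a tail-call
// convention.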
1687 1688/// CallIsStructReturn - Determines whether a call uses struct return 1689/// semantics. 1690static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1691 if (Outs.empty()) 1692 return false; 1693 1694 return Outs[0].Flags.isSRet(); 1695} 1696 1697/// ArgsAreStructReturn - Determines whether a function uses struct 1698/// return semantics. 1699static bool 1700ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1701 if (Ins.empty()) 1702 return false; 1703 1704 return Ins[0].Flags.isSRet(); 1705} 1706 1707/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1708/// by "Src" to address "Dst" with size and alignment information specified by 1709/// the specific parameter attribute. The copy will be passed as a byval 1710/// function parameter. 1711static SDValue 1712CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1713 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1714 DebugLoc dl) { 1715 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1716 1717 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1718 /*isVolatile*/false, /*AlwaysInline=*/true, 1719 MachinePointerInfo(), MachinePointerInfo()); 1720} 1721 1722/// IsTailCallConvention - Return true if the calling convention is one that 1723/// supports tail call optimization. 1724static bool IsTailCallConvention(CallingConv::ID CC) { 1725 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1726} 1727 1728bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 1729 if (!CI->isTailCall()) 1730 return false; 1731 1732 CallSite CS(CI); 1733 CallingConv::ID CalleeCC = CS.getCallingConv(); 1734 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 1735 return false; 1736 1737 return true; 1738} 1739 1740/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1741/// a tailcall target by changing its ABI. 1742static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, 1743 bool GuaranteedTailCallOpt) { 1744 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1745} 1746 1747SDValue 1748X86TargetLowering::LowerMemArgument(SDValue Chain, 1749 CallingConv::ID CallConv, 1750 const SmallVectorImpl<ISD::InputArg> &Ins, 1751 DebugLoc dl, SelectionDAG &DAG, 1752 const CCValAssign &VA, 1753 MachineFrameInfo *MFI, 1754 unsigned i) const { 1755 // Create the nodes corresponding to a load from this parameter slot. 1756 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1757 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv, 1758 getTargetMachine().Options.GuaranteedTailCallOpt); 1759 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1760 EVT ValVT; 1761 1762 // If value is passed by pointer we have address passed instead of the value 1763 // itself. 1764 if (VA.getLocInfo() == CCValAssign::Indirect) 1765 ValVT = VA.getLocVT(); 1766 else 1767 ValVT = VA.getValVT(); 1768 1769 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1770 // changed with more analysis. 1771 // In case of tail call optimization mark all arguments mutable. Since they 1772 // could be overwritten by lowering of arguments in case of a tail call. 1773 if (Flags.isByVal()) { 1774 unsigned Bytes = Flags.getByValSize(); 1775 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 
1776 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 1777 return DAG.getFrameIndex(FI, getPointerTy()); 1778 } else { 1779 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1780 VA.getLocMemOffset(), isImmutable); 1781 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1782 return DAG.getLoad(ValVT, dl, Chain, FIN, 1783 MachinePointerInfo::getFixedStack(FI), 1784 false, false, false, 0); 1785 } 1786} 1787 1788SDValue 1789X86TargetLowering::LowerFormalArguments(SDValue Chain, 1790 CallingConv::ID CallConv, 1791 bool isVarArg, 1792 const SmallVectorImpl<ISD::InputArg> &Ins, 1793 DebugLoc dl, 1794 SelectionDAG &DAG, 1795 SmallVectorImpl<SDValue> &InVals) 1796 const { 1797 MachineFunction &MF = DAG.getMachineFunction(); 1798 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1799 1800 const Function* Fn = MF.getFunction(); 1801 if (Fn->hasExternalLinkage() && 1802 Subtarget->isTargetCygMing() && 1803 Fn->getName() == "main") 1804 FuncInfo->setForceFramePointer(true); 1805 1806 MachineFrameInfo *MFI = MF.getFrameInfo(); 1807 bool Is64Bit = Subtarget->is64Bit(); 1808 bool IsWin64 = Subtarget->isTargetWin64(); 1809 1810 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1811 "Var args not supported with calling convention fastcc or ghc"); 1812 1813 // Assign locations to all of the incoming arguments. 1814 SmallVector<CCValAssign, 16> ArgLocs; 1815 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1816 ArgLocs, *DAG.getContext()); 1817 1818 // Allocate shadow area for Win64 1819 if (IsWin64) { 1820 CCInfo.AllocateStack(32, 8); 1821 } 1822 1823 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1824 1825 unsigned LastVal = ~0U; 1826 SDValue ArgValue; 1827 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1828 CCValAssign &VA = ArgLocs[i]; 1829 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1830 // places. 1831 assert(VA.getValNo() != LastVal && 1832 "Don't support value assigned to multiple locs yet"); 1833 (void)LastVal; 1834 LastVal = VA.getValNo(); 1835 1836 if (VA.isRegLoc()) { 1837 EVT RegVT = VA.getLocVT(); 1838 TargetRegisterClass *RC = NULL; 1839 if (RegVT == MVT::i32) 1840 RC = X86::GR32RegisterClass; 1841 else if (Is64Bit && RegVT == MVT::i64) 1842 RC = X86::GR64RegisterClass; 1843 else if (RegVT == MVT::f32) 1844 RC = X86::FR32RegisterClass; 1845 else if (RegVT == MVT::f64) 1846 RC = X86::FR64RegisterClass; 1847 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1848 RC = X86::VR256RegisterClass; 1849 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1850 RC = X86::VR128RegisterClass; 1851 else if (RegVT == MVT::x86mmx) 1852 RC = X86::VR64RegisterClass; 1853 else 1854 llvm_unreachable("Unknown argument type!"); 1855 1856 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1857 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1858 1859 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1860 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1861 // right size. 
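// For example (illustrative): an i8 argument arrives in the low byte of a
// 32-bit GPR, so the code below wraps the CopyFromReg in an AssertZext or
// AssertSext of type i8 and then truncates back to i8; later passes then
// know how the upper 24 bits were produced and that the truncate is free.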
1862 if (VA.getLocInfo() == CCValAssign::SExt) 1863 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1864 DAG.getValueType(VA.getValVT())); 1865 else if (VA.getLocInfo() == CCValAssign::ZExt) 1866 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1867 DAG.getValueType(VA.getValVT())); 1868 else if (VA.getLocInfo() == CCValAssign::BCvt) 1869 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1870 1871 if (VA.isExtInLoc()) { 1872 // Handle MMX values passed in XMM regs. 1873 if (RegVT.isVector()) { 1874 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1875 ArgValue); 1876 } else 1877 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1878 } 1879 } else { 1880 assert(VA.isMemLoc()); 1881 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1882 } 1883 1884 // If value is passed via pointer - do a load. 1885 if (VA.getLocInfo() == CCValAssign::Indirect) 1886 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1887 MachinePointerInfo(), false, false, false, 0); 1888 1889 InVals.push_back(ArgValue); 1890 } 1891 1892 // The x86-64 ABI for returning structs by value requires that we copy 1893 // the sret argument into %rax for the return. Save the argument into 1894 // a virtual register so that we can access it from the return points. 1895 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1896 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1897 unsigned Reg = FuncInfo->getSRetReturnReg(); 1898 if (!Reg) { 1899 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1900 FuncInfo->setSRetReturnReg(Reg); 1901 } 1902 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1903 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1904 } 1905 1906 unsigned StackSize = CCInfo.getNextStackOffset(); 1907 // Align stack specially for tail calls. 1908 if (FuncIsMadeTailCallSafe(CallConv, 1909 MF.getTarget().Options.GuaranteedTailCallOpt)) 1910 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1911 1912 // If the function takes variable number of arguments, make a frame index for 1913 // the start of the first vararg value... for expansion of llvm.va_start. 1914 if (isVarArg) { 1915 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1916 CallConv != CallingConv::X86_ThisCall)) { 1917 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1918 } 1919 if (Is64Bit) { 1920 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1921 1922 // FIXME: We should really autogenerate these arrays 1923 static const unsigned GPR64ArgRegsWin64[] = { 1924 X86::RCX, X86::RDX, X86::R8, X86::R9 1925 }; 1926 static const unsigned GPR64ArgRegs64Bit[] = { 1927 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1928 }; 1929 static const unsigned XMMArgRegs64Bit[] = { 1930 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1931 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1932 }; 1933 const unsigned *GPR64ArgRegs; 1934 unsigned NumXMMRegs = 0; 1935 1936 if (IsWin64) { 1937 // The XMM registers which might contain var arg parameters are shadowed 1938 // in their paired GPR. So we only need to save the GPR to their home 1939 // slots. 
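// Illustrative layout (assuming the usual Win64 frame): the caller always
// reserves a 32-byte shadow area, so on entry to the callee the home slots
// for RCX, RDX, R8 and R9 sit at [RSP+8], [RSP+16], [RSP+24] and [RSP+32],
// just above the return address. A vararg double passed in XMM1 is also
// duplicated in RDX, so spilling the GPRs alone lets va_arg find every
// register argument.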
1940 TotalNumIntRegs = 4; 1941 GPR64ArgRegs = GPR64ArgRegsWin64; 1942 } else { 1943 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1944 GPR64ArgRegs = GPR64ArgRegs64Bit; 1945 1946 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, 1947 TotalNumXMMRegs); 1948 } 1949 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1950 TotalNumIntRegs); 1951 1952 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1953 assert(!(NumXMMRegs && !Subtarget->hasXMM()) && 1954 "SSE register cannot be used when SSE is disabled!"); 1955 assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && 1956 NoImplicitFloatOps) && 1957 "SSE register cannot be used when SSE is disabled!"); 1958 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || 1959 !Subtarget->hasXMM()) 1960 // Kernel mode asks for SSE to be disabled, so don't push them 1961 // on the stack. 1962 TotalNumXMMRegs = 0; 1963 1964 if (IsWin64) { 1965 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); 1966 // Get to the caller-allocated home save location. Add 8 to account 1967 // for the return address. 1968 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 1969 FuncInfo->setRegSaveFrameIndex( 1970 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1971 // Fixup to set vararg frame on shadow area (4 x i64). 1972 if (NumIntRegs < 4) 1973 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1974 } else { 1975 // For X86-64, if there are vararg parameters that are passed via 1976 // registers, then we must store them to their spots on the stack so 1977 // they may be loaded by deferencing the result of va_next. 1978 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1979 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1980 FuncInfo->setRegSaveFrameIndex( 1981 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1982 false)); 1983 } 1984 1985 // Store the integer parameter registers. 1986 SmallVector<SDValue, 8> MemOps; 1987 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1988 getPointerTy()); 1989 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1990 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1991 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1992 DAG.getIntPtrConstant(Offset)); 1993 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1994 X86::GR64RegisterClass); 1995 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1996 SDValue Store = 1997 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1998 MachinePointerInfo::getFixedStack( 1999 FuncInfo->getRegSaveFrameIndex(), Offset), 2000 false, false, 0); 2001 MemOps.push_back(Store); 2002 Offset += 8; 2003 } 2004 2005 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 2006 // Now store the XMM (fp + vector) parameter registers. 
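// Illustrative note: the node built below becomes a pseudo instruction that
// the custom inserter later expands to roughly "testb %al, %al; je skip;
// <movaps stores of the XMM argument registers into the register save area>;
// skip:", so the vector registers are only spilled when AL reports that the
// caller actually used some of them.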
2007 SmallVector<SDValue, 11> SaveXMMOps; 2008 SaveXMMOps.push_back(Chain); 2009 2010 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 2011 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 2012 SaveXMMOps.push_back(ALVal); 2013 2014 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2015 FuncInfo->getRegSaveFrameIndex())); 2016 SaveXMMOps.push_back(DAG.getIntPtrConstant( 2017 FuncInfo->getVarArgsFPOffset())); 2018 2019 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 2020 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 2021 X86::VR128RegisterClass); 2022 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 2023 SaveXMMOps.push_back(Val); 2024 } 2025 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 2026 MVT::Other, 2027 &SaveXMMOps[0], SaveXMMOps.size())); 2028 } 2029 2030 if (!MemOps.empty()) 2031 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2032 &MemOps[0], MemOps.size()); 2033 } 2034 } 2035 2036 // Some CCs need callee pop. 2037 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2038 MF.getTarget().Options.GuaranteedTailCallOpt)) { 2039 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 2040 } else { 2041 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 2042 // If this is an sret function, the return should pop the hidden pointer. 2043 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 2044 FuncInfo->setBytesToPopOnReturn(4); 2045 } 2046 2047 if (!Is64Bit) { 2048 // RegSaveFrameIndex is X86-64 only. 2049 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 2050 if (CallConv == CallingConv::X86_FastCall || 2051 CallConv == CallingConv::X86_ThisCall) 2052 // fastcc functions can't have varargs. 2053 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 2054 } 2055 2056 FuncInfo->setArgumentStackSize(StackSize); 2057 2058 return Chain; 2059} 2060 2061SDValue 2062X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 2063 SDValue StackPtr, SDValue Arg, 2064 DebugLoc dl, SelectionDAG &DAG, 2065 const CCValAssign &VA, 2066 ISD::ArgFlagsTy Flags) const { 2067 unsigned LocMemOffset = VA.getLocMemOffset(); 2068 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 2069 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 2070 if (Flags.isByVal()) 2071 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 2072 2073 return DAG.getStore(Chain, dl, Arg, PtrOff, 2074 MachinePointerInfo::getStack(LocMemOffset), 2075 false, false, 0); 2076} 2077 2078/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 2079/// optimization is performed and it is required. 2080SDValue 2081X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 2082 SDValue &OutRetAddr, SDValue Chain, 2083 bool IsTailCall, bool Is64Bit, 2084 int FPDiff, DebugLoc dl) const { 2085 // Adjust the Return address stack slot. 2086 EVT VT = getPointerTy(); 2087 OutRetAddr = getReturnAddressFrameIndex(DAG); 2088 2089 // Load the "old" Return address. 2090 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 2091 false, false, false, 0); 2092 return SDValue(OutRetAddr.getNode(), 1); 2093} 2094 2095/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 2096/// optimization is performed and it is required (FPDiff!=0). 
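// Worked example (illustrative numbers): if the caller's frame reserved 16
// bytes of argument space but the tail-called function needs 32, then
// FPDiff = 16 - 32 = -16, and the helper below re-stores the return address
// into a new fixed slot at FPDiff - SlotSize, i.e. 16 bytes further down
// than the old slot, creating the "move area" shown in the stack-layout
// comment later in this file.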
2097static SDValue 2098EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 2099 SDValue Chain, SDValue RetAddrFrIdx, 2100 bool Is64Bit, int FPDiff, DebugLoc dl) { 2101 // Store the return address to the appropriate stack slot. 2102 if (!FPDiff) return Chain; 2103 // Calculate the new stack slot for the return address. 2104 int SlotSize = Is64Bit ? 8 : 4; 2105 int NewReturnAddrFI = 2106 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 2107 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 2108 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 2109 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 2110 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 2111 false, false, 0); 2112 return Chain; 2113} 2114 2115SDValue 2116X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 2117 CallingConv::ID CallConv, bool isVarArg, 2118 bool &isTailCall, 2119 const SmallVectorImpl<ISD::OutputArg> &Outs, 2120 const SmallVectorImpl<SDValue> &OutVals, 2121 const SmallVectorImpl<ISD::InputArg> &Ins, 2122 DebugLoc dl, SelectionDAG &DAG, 2123 SmallVectorImpl<SDValue> &InVals) const { 2124 MachineFunction &MF = DAG.getMachineFunction(); 2125 bool Is64Bit = Subtarget->is64Bit(); 2126 bool IsWin64 = Subtarget->isTargetWin64(); 2127 bool IsStructRet = CallIsStructReturn(Outs); 2128 bool IsSibcall = false; 2129 2130 if (isTailCall) { 2131 // Check if it's really possible to do a tail call. 2132 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 2133 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 2134 Outs, OutVals, Ins, DAG); 2135 2136 // Sibcalls are automatically detected tailcalls which do not require 2137 // ABI changes. 2138 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) 2139 IsSibcall = true; 2140 2141 if (isTailCall) 2142 ++NumTailCalls; 2143 } 2144 2145 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 2146 "Var args not supported with calling convention fastcc or ghc"); 2147 2148 // Analyze operands of the call, assigning locations to each operand. 2149 SmallVector<CCValAssign, 16> ArgLocs; 2150 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 2151 ArgLocs, *DAG.getContext()); 2152 2153 // Allocate shadow area for Win64 2154 if (IsWin64) { 2155 CCInfo.AllocateStack(32, 8); 2156 } 2157 2158 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2159 2160 // Get a count of how many bytes are to be pushed on the stack. 2161 unsigned NumBytes = CCInfo.getNextStackOffset(); 2162 if (IsSibcall) 2163 // This is a sibcall. The memory operands are available in caller's 2164 // own caller's stack. 2165 NumBytes = 0; 2166 else if (getTargetMachine().Options.GuaranteedTailCallOpt && 2167 IsTailCallConvention(CallConv)) 2168 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 2169 2170 int FPDiff = 0; 2171 if (isTailCall && !IsSibcall) { 2172 // Lower arguments at fp - stackoffset + fpdiff. 2173 unsigned NumBytesCallerPushed = 2174 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 2175 FPDiff = NumBytesCallerPushed - NumBytes; 2176 2177 // Set the delta of movement of the returnaddr stackslot. 2178 // But only set if delta is greater than previous delta. 2179 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 2180 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 2181 } 2182 2183 if (!IsSibcall) 2184 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 2185 2186 SDValue RetAddrFrIdx; 2187 // Load return address for tail calls. 
2188 if (isTailCall && FPDiff) 2189 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 2190 Is64Bit, FPDiff, dl); 2191 2192 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2193 SmallVector<SDValue, 8> MemOpChains; 2194 SDValue StackPtr; 2195 2196 // Walk the register/memloc assignments, inserting copies/loads. In the case 2197 // of tail call optimization arguments are handle later. 2198 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2199 CCValAssign &VA = ArgLocs[i]; 2200 EVT RegVT = VA.getLocVT(); 2201 SDValue Arg = OutVals[i]; 2202 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2203 bool isByVal = Flags.isByVal(); 2204 2205 // Promote the value if needed. 2206 switch (VA.getLocInfo()) { 2207 default: llvm_unreachable("Unknown loc info!"); 2208 case CCValAssign::Full: break; 2209 case CCValAssign::SExt: 2210 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 2211 break; 2212 case CCValAssign::ZExt: 2213 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 2214 break; 2215 case CCValAssign::AExt: 2216 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 2217 // Special case: passing MMX values in XMM registers. 2218 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 2219 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 2220 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 2221 } else 2222 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 2223 break; 2224 case CCValAssign::BCvt: 2225 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 2226 break; 2227 case CCValAssign::Indirect: { 2228 // Store the argument. 2229 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2230 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2231 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2232 MachinePointerInfo::getFixedStack(FI), 2233 false, false, 0); 2234 Arg = SpillSlot; 2235 break; 2236 } 2237 } 2238 2239 if (VA.isRegLoc()) { 2240 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2241 if (isVarArg && IsWin64) { 2242 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2243 // shadow reg if callee is a varargs function. 2244 unsigned ShadowReg = 0; 2245 switch (VA.getLocReg()) { 2246 case X86::XMM0: ShadowReg = X86::RCX; break; 2247 case X86::XMM1: ShadowReg = X86::RDX; break; 2248 case X86::XMM2: ShadowReg = X86::R8; break; 2249 case X86::XMM3: ShadowReg = X86::R9; break; 2250 } 2251 if (ShadowReg) 2252 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 2253 } 2254 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2255 assert(VA.isMemLoc()); 2256 if (StackPtr.getNode() == 0) 2257 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2258 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2259 dl, DAG, VA, Flags)); 2260 } 2261 } 2262 2263 if (!MemOpChains.empty()) 2264 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2265 &MemOpChains[0], MemOpChains.size()); 2266 2267 // Build a sequence of copy-to-reg nodes chained together with token chain 2268 // and flag operands which copy the outgoing args into registers. 2269 SDValue InFlag; 2270 // Tail call byval lowering might overwrite argument registers so in case of 2271 // tail call optimization the copies to registers are lowered later. 
2272 if (!isTailCall) 2273 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2274 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2275 RegsToPass[i].second, InFlag); 2276 InFlag = Chain.getValue(1); 2277 } 2278 2279 if (Subtarget->isPICStyleGOT()) { 2280 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2281 // GOT pointer. 2282 if (!isTailCall) { 2283 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2284 DAG.getNode(X86ISD::GlobalBaseReg, 2285 DebugLoc(), getPointerTy()), 2286 InFlag); 2287 InFlag = Chain.getValue(1); 2288 } else { 2289 // If we are tail calling and generating PIC/GOT style code load the 2290 // address of the callee into ECX. The value in ecx is used as target of 2291 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2292 // for tail calls on PIC/GOT architectures. Normally we would just put the 2293 // address of GOT into ebx and then call target@PLT. But for tail calls 2294 // ebx would be restored (since ebx is callee saved) before jumping to the 2295 // target@PLT. 2296 2297 // Note: The actual moving to ECX is done further down. 2298 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2299 if (G && !G->getGlobal()->hasHiddenVisibility() && 2300 !G->getGlobal()->hasProtectedVisibility()) 2301 Callee = LowerGlobalAddress(Callee, DAG); 2302 else if (isa<ExternalSymbolSDNode>(Callee)) 2303 Callee = LowerExternalSymbol(Callee, DAG); 2304 } 2305 } 2306 2307 if (Is64Bit && isVarArg && !IsWin64) { 2308 // From AMD64 ABI document: 2309 // For calls that may call functions that use varargs or stdargs 2310 // (prototype-less calls or calls to functions containing ellipsis (...) in 2311 // the declaration) %al is used as hidden argument to specify the number 2312 // of SSE registers used. The contents of %al do not need to match exactly 2313 // the number of registers, but must be an ubound on the number of SSE 2314 // registers used and is in the range 0 - 8 inclusive. 2315 2316 // Count the number of XMM registers allocated. 2317 static const unsigned XMMArgRegs[] = { 2318 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2319 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2320 }; 2321 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2322 assert((Subtarget->hasXMM() || !NumXMMRegs) 2323 && "SSE registers cannot be used when SSE is disabled"); 2324 2325 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2326 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2327 InFlag = Chain.getValue(1); 2328 } 2329 2330 2331 // For tail calls lower the arguments to the 'real' stack slot. 2332 if (isTailCall) { 2333 // Force all the incoming stack arguments to be loaded from the stack 2334 // before any new outgoing arguments are stored to the stack, because the 2335 // outgoing stack slots may alias the incoming argument stack slots, and 2336 // the alias isn't otherwise explicit. This is slightly more conservative 2337 // than necessary, because it means that each store effectively depends 2338 // on every argument instead of just those arguments it would clobber. 2339 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2340 2341 SmallVector<SDValue, 8> MemOpChains2; 2342 SDValue FIN; 2343 int FI = 0; 2344 // Do not flag preceding copytoreg stuff together with the following stuff. 
2345 InFlag = SDValue(); 2346 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2347 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2348 CCValAssign &VA = ArgLocs[i]; 2349 if (VA.isRegLoc()) 2350 continue; 2351 assert(VA.isMemLoc()); 2352 SDValue Arg = OutVals[i]; 2353 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2354 // Create frame index. 2355 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2356 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2357 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2358 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2359 2360 if (Flags.isByVal()) { 2361 // Copy relative to framepointer. 2362 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2363 if (StackPtr.getNode() == 0) 2364 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2365 getPointerTy()); 2366 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2367 2368 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2369 ArgChain, 2370 Flags, DAG, dl)); 2371 } else { 2372 // Store relative to framepointer. 2373 MemOpChains2.push_back( 2374 DAG.getStore(ArgChain, dl, Arg, FIN, 2375 MachinePointerInfo::getFixedStack(FI), 2376 false, false, 0)); 2377 } 2378 } 2379 } 2380 2381 if (!MemOpChains2.empty()) 2382 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2383 &MemOpChains2[0], MemOpChains2.size()); 2384 2385 // Copy arguments to their registers. 2386 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2387 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2388 RegsToPass[i].second, InFlag); 2389 InFlag = Chain.getValue(1); 2390 } 2391 InFlag =SDValue(); 2392 2393 // Store the return address to the appropriate stack slot. 2394 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2395 FPDiff, dl); 2396 } 2397 2398 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2399 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2400 // In the 64-bit large code model, we have to make all calls 2401 // through a register, since the call instruction's 32-bit 2402 // pc-relative offset may not be large enough to hold the whole 2403 // address. 2404 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2405 // If the callee is a GlobalAddress node (quite common, every direct call 2406 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2407 // it. 2408 2409 // We should use extra load for direct calls to dllimported functions in 2410 // non-JIT mode. 2411 const GlobalValue *GV = G->getGlobal(); 2412 if (!GV->hasDLLImportLinkage()) { 2413 unsigned char OpFlags = 0; 2414 bool ExtraLoad = false; 2415 unsigned WrapperKind = ISD::DELETED_NODE; 2416 2417 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2418 // external symbols most go through the PLT in PIC mode. If the symbol 2419 // has hidden or protected visibility, or if it is static or local, then 2420 // we don't need to use the PLT - we can directly call it. 
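// Concrete examples (illustrative): on Linux/ELF PIC a direct call to an
// external function foo is emitted as "call foo@PLT" (MO_PLT); on Darwin
// before 10.5 it goes through a linker stub such as L_foo$stub
// (MO_DARWIN_STUB); and a function marked nonlazybind is reached through its
// GOT entry, roughly "callq *foo@GOTPCREL(%rip)" (MO_GOTPCREL plus the extra
// load added below).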
2421 if (Subtarget->isTargetELF() && 2422 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2423 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2424 OpFlags = X86II::MO_PLT; 2425 } else if (Subtarget->isPICStyleStubAny() && 2426 (GV->isDeclaration() || GV->isWeakForLinker()) && 2427 (!Subtarget->getTargetTriple().isMacOSX() || 2428 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2429 // PC-relative references to external symbols should go through $stub, 2430 // unless we're building with the leopard linker or later, which 2431 // automatically synthesizes these stubs. 2432 OpFlags = X86II::MO_DARWIN_STUB; 2433 } else if (Subtarget->isPICStyleRIPRel() && 2434 isa<Function>(GV) && 2435 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { 2436 // If the function is marked as non-lazy, generate an indirect call 2437 // which loads from the GOT directly. This avoids runtime overhead 2438 // at the cost of eager binding (and one extra byte of encoding). 2439 OpFlags = X86II::MO_GOTPCREL; 2440 WrapperKind = X86ISD::WrapperRIP; 2441 ExtraLoad = true; 2442 } 2443 2444 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2445 G->getOffset(), OpFlags); 2446 2447 // Add a wrapper if needed. 2448 if (WrapperKind != ISD::DELETED_NODE) 2449 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2450 // Add extra indirection if needed. 2451 if (ExtraLoad) 2452 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2453 MachinePointerInfo::getGOT(), 2454 false, false, false, 0); 2455 } 2456 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2457 unsigned char OpFlags = 0; 2458 2459 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2460 // external symbols should go through the PLT. 2461 if (Subtarget->isTargetELF() && 2462 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2463 OpFlags = X86II::MO_PLT; 2464 } else if (Subtarget->isPICStyleStubAny() && 2465 (!Subtarget->getTargetTriple().isMacOSX() || 2466 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2467 // PC-relative references to external symbols should go through $stub, 2468 // unless we're building with the leopard linker or later, which 2469 // automatically synthesizes these stubs. 2470 OpFlags = X86II::MO_DARWIN_STUB; 2471 } 2472 2473 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2474 OpFlags); 2475 } 2476 2477 // Returns a chain & a flag for retval copy to use. 2478 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2479 SmallVector<SDValue, 8> Ops; 2480 2481 if (!IsSibcall && isTailCall) { 2482 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2483 DAG.getIntPtrConstant(0, true), InFlag); 2484 InFlag = Chain.getValue(1); 2485 } 2486 2487 Ops.push_back(Chain); 2488 Ops.push_back(Callee); 2489 2490 if (isTailCall) 2491 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2492 2493 // Add argument registers to the end of the list so that they are known live 2494 // into the call. 2495 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2496 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2497 RegsToPass[i].second.getValueType())); 2498 2499 // Add an implicit use GOT pointer in EBX. 2500 if (!isTailCall && Subtarget->isPICStyleGOT()) 2501 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2502 2503 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 
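// For example (illustrative): lowering printf("%f\n", x) on x86-64 SysV puts
// x in XMM0, so NumXMMRegs == 1 and the copy emitted above becomes something
// like "movb $1, %al" before the call; AL only has to be an upper bound on
// the number of vector registers actually used.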
2504 if (Is64Bit && isVarArg && !IsWin64) 2505 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2506 2507 if (InFlag.getNode()) 2508 Ops.push_back(InFlag); 2509 2510 if (isTailCall) { 2511 // We used to do: 2512 //// If this is the first return lowered for this function, add the regs 2513 //// to the liveout set for the function. 2514 // This isn't right, although it's probably harmless on x86; liveouts 2515 // should be computed from returns not tail calls. Consider a void 2516 // function making a tail call to a function returning int. 2517 return DAG.getNode(X86ISD::TC_RETURN, dl, 2518 NodeTys, &Ops[0], Ops.size()); 2519 } 2520 2521 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2522 InFlag = Chain.getValue(1); 2523 2524 // Create the CALLSEQ_END node. 2525 unsigned NumBytesForCalleeToPush; 2526 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 2527 getTargetMachine().Options.GuaranteedTailCallOpt)) 2528 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2529 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2530 // If this is a call to a struct-return function, the callee 2531 // pops the hidden struct pointer, so we have to push it back. 2532 // This is common for Darwin/X86, Linux & Mingw32 targets. 2533 NumBytesForCalleeToPush = 4; 2534 else 2535 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2536 2537 // Returns a flag for retval copy to use. 2538 if (!IsSibcall) { 2539 Chain = DAG.getCALLSEQ_END(Chain, 2540 DAG.getIntPtrConstant(NumBytes, true), 2541 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2542 true), 2543 InFlag); 2544 InFlag = Chain.getValue(1); 2545 } 2546 2547 // Handle result values, copying them out of physregs into vregs that we 2548 // return. 2549 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2550 Ins, dl, DAG, InVals); 2551} 2552 2553 2554//===----------------------------------------------------------------------===// 2555// Fast Calling Convention (tail call) implementation 2556//===----------------------------------------------------------------------===// 2557 2558// Like std call, callee cleans arguments, convention except that ECX is 2559// reserved for storing the tail called function address. Only 2 registers are 2560// free for argument passing (inreg). Tail call optimization is performed 2561// provided: 2562// * tailcallopt is enabled 2563// * caller/callee are fastcc 2564// On X86_64 architecture with GOT-style position independent code only local 2565// (within module) calls are supported at the moment. 2566// To keep the stack aligned according to platform abi the function 2567// GetAlignedArgumentStackSize ensures that argument delta is always multiples 2568// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) 2569// If a tail called function callee has more arguments than the caller the 2570// caller needs to make sure that there is room to move the RETADDR to. This is 2571// achieved by reserving an area the size of the argument delta right after the 2572// original REtADDR, but before the saved framepointer or the spilled registers 2573// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) 2574// stack layout: 2575// arg1 2576// arg2 2577// RETADDR 2578// [ new RETADDR 2579// move area ] 2580// (possible EBP) 2581// ESI 2582// EDI 2583// local1 .. 2584 2585/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 2586/// for a 16 byte align requirement. 
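// Worked example (illustrative, assuming a 16-byte stack alignment): with an
// 8-byte slot (x86-64) the result is the smallest value of the form 16n + 8
// that is >= StackSize, so 20 -> 24 and 28 -> 40; with a 4-byte slot
// (x86-32) the target form is 16n + 12, matching the "16n + 12" mentioned
// above. The missing slot is the room taken by the pushed return address, so
// the stack stays aligned across the call.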
2587unsigned 2588X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2589 SelectionDAG& DAG) const { 2590 MachineFunction &MF = DAG.getMachineFunction(); 2591 const TargetMachine &TM = MF.getTarget(); 2592 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 2593 unsigned StackAlignment = TFI.getStackAlignment(); 2594 uint64_t AlignMask = StackAlignment - 1; 2595 int64_t Offset = StackSize; 2596 uint64_t SlotSize = TD->getPointerSize(); 2597 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2598 // Number smaller than 12 so just add the difference. 2599 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2600 } else { 2601 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2602 Offset = ((~AlignMask) & Offset) + StackAlignment + 2603 (StackAlignment-SlotSize); 2604 } 2605 return Offset; 2606} 2607 2608/// MatchingStackOffset - Return true if the given stack call argument is 2609/// already available in the same position (relatively) of the caller's 2610/// incoming argument stack. 2611static 2612bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2613 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2614 const X86InstrInfo *TII) { 2615 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2616 int FI = INT_MAX; 2617 if (Arg.getOpcode() == ISD::CopyFromReg) { 2618 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2619 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2620 return false; 2621 MachineInstr *Def = MRI->getVRegDef(VR); 2622 if (!Def) 2623 return false; 2624 if (!Flags.isByVal()) { 2625 if (!TII->isLoadFromStackSlot(Def, FI)) 2626 return false; 2627 } else { 2628 unsigned Opcode = Def->getOpcode(); 2629 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2630 Def->getOperand(1).isFI()) { 2631 FI = Def->getOperand(1).getIndex(); 2632 Bytes = Flags.getByValSize(); 2633 } else 2634 return false; 2635 } 2636 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2637 if (Flags.isByVal()) 2638 // ByVal argument is passed in as a pointer but it's now being 2639 // dereferenced. e.g. 2640 // define @foo(%struct.X* %A) { 2641 // tail call @bar(%struct.X* byval %A) 2642 // } 2643 return false; 2644 SDValue Ptr = Ld->getBasePtr(); 2645 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2646 if (!FINode) 2647 return false; 2648 FI = FINode->getIndex(); 2649 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2650 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2651 FI = FINode->getIndex(); 2652 Bytes = Flags.getByValSize(); 2653 } else 2654 return false; 2655 2656 assert(FI != INT_MAX); 2657 if (!MFI->isFixedObjectIndex(FI)) 2658 return false; 2659 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2660} 2661 2662/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2663/// for tail call optimization. Targets which want to do tail call 2664/// optimization should implement this function. 
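// Illustrative example (hypothetical IR, added for exposition): the checks
// below accept
//
//   define i32 @caller(i32 %a, i32 %b) {
//     %r = tail call i32 @callee(i32 %a, i32 %b)
//     ret i32 %r
//   }
//
// as a sibcall, since each stack argument is already sitting in the matching
// incoming slot (see MatchingStackOffset above) and no stack adjustment is
// needed; they reject it if, for instance, the caller requires dynamic stack
// realignment or either side uses struct return.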
2665bool 2666X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2667 CallingConv::ID CalleeCC, 2668 bool isVarArg, 2669 bool isCalleeStructRet, 2670 bool isCallerStructRet, 2671 const SmallVectorImpl<ISD::OutputArg> &Outs, 2672 const SmallVectorImpl<SDValue> &OutVals, 2673 const SmallVectorImpl<ISD::InputArg> &Ins, 2674 SelectionDAG& DAG) const { 2675 if (!IsTailCallConvention(CalleeCC) && 2676 CalleeCC != CallingConv::C) 2677 return false; 2678 2679 // If -tailcallopt is specified, make fastcc functions tail-callable. 2680 const MachineFunction &MF = DAG.getMachineFunction(); 2681 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2682 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2683 bool CCMatch = CallerCC == CalleeCC; 2684 2685 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2686 if (IsTailCallConvention(CalleeCC) && CCMatch) 2687 return true; 2688 return false; 2689 } 2690 2691 // Look for obvious safe cases to perform tail call optimization that do not 2692 // require ABI changes. This is what gcc calls sibcall. 2693 2694 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2695 // emit a special epilogue. 2696 if (RegInfo->needsStackRealignment(MF)) 2697 return false; 2698 2699 // Also avoid sibcall optimization if either caller or callee uses struct 2700 // return semantics. 2701 if (isCalleeStructRet || isCallerStructRet) 2702 return false; 2703 2704 // An stdcall caller is expected to clean up its arguments; the callee 2705 // isn't going to do that. 2706 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2707 return false; 2708 2709 // Do not sibcall optimize vararg calls unless all arguments are passed via 2710 // registers. 2711 if (isVarArg && !Outs.empty()) { 2712 2713 // Optimizing for varargs on Win64 is unlikely to be safe without 2714 // additional testing. 2715 if (Subtarget->isTargetWin64()) 2716 return false; 2717 2718 SmallVector<CCValAssign, 16> ArgLocs; 2719 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2720 getTargetMachine(), ArgLocs, *DAG.getContext()); 2721 2722 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2723 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2724 if (!ArgLocs[i].isRegLoc()) 2725 return false; 2726 } 2727 2728 // If the call result is in ST0 / ST1, it needs to be popped off the x87 2729 // stack. Therefore, if it's not used by the call it is not safe to optimize 2730 // this into a sibcall. 2731 bool Unused = false; 2732 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2733 if (!Ins[i].Used) { 2734 Unused = true; 2735 break; 2736 } 2737 } 2738 if (Unused) { 2739 SmallVector<CCValAssign, 16> RVLocs; 2740 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 2741 getTargetMachine(), RVLocs, *DAG.getContext()); 2742 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2743 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2744 CCValAssign &VA = RVLocs[i]; 2745 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2746 return false; 2747 } 2748 } 2749 2750 // If the calling conventions do not match, then we'd better make sure the 2751 // results are returned in the same way as what the caller expects. 
2752 if (!CCMatch) { 2753 SmallVector<CCValAssign, 16> RVLocs1; 2754 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 2755 getTargetMachine(), RVLocs1, *DAG.getContext()); 2756 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2757 2758 SmallVector<CCValAssign, 16> RVLocs2; 2759 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 2760 getTargetMachine(), RVLocs2, *DAG.getContext()); 2761 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2762 2763 if (RVLocs1.size() != RVLocs2.size()) 2764 return false; 2765 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2766 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2767 return false; 2768 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2769 return false; 2770 if (RVLocs1[i].isRegLoc()) { 2771 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2772 return false; 2773 } else { 2774 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2775 return false; 2776 } 2777 } 2778 } 2779 2780 // If the callee takes no arguments then go on to check the results of the 2781 // call. 2782 if (!Outs.empty()) { 2783 // Check if stack adjustment is needed. For now, do not do this if any 2784 // argument is passed on the stack. 2785 SmallVector<CCValAssign, 16> ArgLocs; 2786 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2787 getTargetMachine(), ArgLocs, *DAG.getContext()); 2788 2789 // Allocate shadow area for Win64 2790 if (Subtarget->isTargetWin64()) { 2791 CCInfo.AllocateStack(32, 8); 2792 } 2793 2794 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2795 if (CCInfo.getNextStackOffset()) { 2796 MachineFunction &MF = DAG.getMachineFunction(); 2797 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2798 return false; 2799 2800 // Check if the arguments are already laid out in the right way as 2801 // the caller's fixed stack objects. 2802 MachineFrameInfo *MFI = MF.getFrameInfo(); 2803 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2804 const X86InstrInfo *TII = 2805 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2806 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2807 CCValAssign &VA = ArgLocs[i]; 2808 SDValue Arg = OutVals[i]; 2809 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2810 if (VA.getLocInfo() == CCValAssign::Indirect) 2811 return false; 2812 if (!VA.isRegLoc()) { 2813 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2814 MFI, MRI, TII)) 2815 return false; 2816 } 2817 } 2818 } 2819 2820 // If the tailcall address may be in a register, then make sure it's 2821 // possible to register allocate for it. In 32-bit, the call address can 2822 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2823 // callee-saved registers are restored. These happen to be the same 2824 // registers used to pass 'inreg' arguments so watch out for those. 
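// For example (illustrative): an indirect tail call whose three i32
// arguments are all marked inreg lands them in EAX, EDX and ECX, leaving no
// register free for the call target itself, so the loop below gives up once
// it has counted all three.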
2825 if (!Subtarget->is64Bit() && 2826 !isa<GlobalAddressSDNode>(Callee) && 2827 !isa<ExternalSymbolSDNode>(Callee)) { 2828 unsigned NumInRegs = 0; 2829 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2830 CCValAssign &VA = ArgLocs[i]; 2831 if (!VA.isRegLoc()) 2832 continue; 2833 unsigned Reg = VA.getLocReg(); 2834 switch (Reg) { 2835 default: break; 2836 case X86::EAX: case X86::EDX: case X86::ECX: 2837 if (++NumInRegs == 3) 2838 return false; 2839 break; 2840 } 2841 } 2842 } 2843 } 2844 2845 return true; 2846} 2847 2848FastISel * 2849X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2850 return X86::createFastISel(funcInfo); 2851} 2852 2853 2854//===----------------------------------------------------------------------===// 2855// Other Lowering Hooks 2856//===----------------------------------------------------------------------===// 2857 2858static bool MayFoldLoad(SDValue Op) { 2859 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2860} 2861 2862static bool MayFoldIntoStore(SDValue Op) { 2863 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2864} 2865 2866static bool isTargetShuffle(unsigned Opcode) { 2867 switch(Opcode) { 2868 default: return false; 2869 case X86ISD::PSHUFD: 2870 case X86ISD::PSHUFHW: 2871 case X86ISD::PSHUFLW: 2872 case X86ISD::SHUFPD: 2873 case X86ISD::PALIGN: 2874 case X86ISD::SHUFPS: 2875 case X86ISD::MOVLHPS: 2876 case X86ISD::MOVLHPD: 2877 case X86ISD::MOVHLPS: 2878 case X86ISD::MOVLPS: 2879 case X86ISD::MOVLPD: 2880 case X86ISD::MOVSHDUP: 2881 case X86ISD::MOVSLDUP: 2882 case X86ISD::MOVDDUP: 2883 case X86ISD::MOVSS: 2884 case X86ISD::MOVSD: 2885 case X86ISD::UNPCKL: 2886 case X86ISD::UNPCKH: 2887 case X86ISD::VPERMILP: 2888 case X86ISD::VPERM2X128: 2889 return true; 2890 } 2891 return false; 2892} 2893 2894static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2895 SDValue V1, SelectionDAG &DAG) { 2896 switch(Opc) { 2897 default: llvm_unreachable("Unknown x86 shuffle node"); 2898 case X86ISD::MOVSHDUP: 2899 case X86ISD::MOVSLDUP: 2900 case X86ISD::MOVDDUP: 2901 return DAG.getNode(Opc, dl, VT, V1); 2902 } 2903 2904 return SDValue(); 2905} 2906 2907static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2908 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2909 switch(Opc) { 2910 default: llvm_unreachable("Unknown x86 shuffle node"); 2911 case X86ISD::PSHUFD: 2912 case X86ISD::PSHUFHW: 2913 case X86ISD::PSHUFLW: 2914 case X86ISD::VPERMILP: 2915 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2916 } 2917 2918 return SDValue(); 2919} 2920 2921static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2922 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2923 switch(Opc) { 2924 default: llvm_unreachable("Unknown x86 shuffle node"); 2925 case X86ISD::PALIGN: 2926 case X86ISD::SHUFPD: 2927 case X86ISD::SHUFPS: 2928 case X86ISD::VPERM2X128: 2929 return DAG.getNode(Opc, dl, VT, V1, V2, 2930 DAG.getConstant(TargetMask, MVT::i8)); 2931 } 2932 return SDValue(); 2933} 2934 2935static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2936 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2937 switch(Opc) { 2938 default: llvm_unreachable("Unknown x86 shuffle node"); 2939 case X86ISD::MOVLHPS: 2940 case X86ISD::MOVLHPD: 2941 case X86ISD::MOVHLPS: 2942 case X86ISD::MOVLPS: 2943 case X86ISD::MOVLPD: 2944 case X86ISD::MOVSS: 2945 case X86ISD::MOVSD: 2946 case X86ISD::UNPCKL: 2947 case X86ISD::UNPCKH: 2948 return 
DAG.getNode(Opc, dl, VT, V1, V2); 2949 } 2950 return SDValue(); 2951} 2952 2953SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2954 MachineFunction &MF = DAG.getMachineFunction(); 2955 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2956 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2957 2958 if (ReturnAddrIndex == 0) { 2959 // Set up a frame object for the return address. 2960 uint64_t SlotSize = TD->getPointerSize(); 2961 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2962 false); 2963 FuncInfo->setRAIndex(ReturnAddrIndex); 2964 } 2965 2966 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2967} 2968 2969 2970bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2971 bool hasSymbolicDisplacement) { 2972 // Offset should fit into 32 bit immediate field. 2973 if (!isInt<32>(Offset)) 2974 return false; 2975 2976 // If we don't have a symbolic displacement - we don't have any extra 2977 // restrictions. 2978 if (!hasSymbolicDisplacement) 2979 return true; 2980 2981 // FIXME: Some tweaks might be needed for medium code model. 2982 if (M != CodeModel::Small && M != CodeModel::Kernel) 2983 return false; 2984 2985 // For small code model we assume that latest object is 16MB before end of 31 2986 // bits boundary. We may also accept pretty large negative constants knowing 2987 // that all objects are in the positive half of address space. 2988 if (M == CodeModel::Small && Offset < 16*1024*1024) 2989 return true; 2990 2991 // For kernel code model we know that all object resist in the negative half 2992 // of 32bits address space. We may not accept negative offsets, since they may 2993 // be just off and we may accept pretty large positive ones. 2994 if (M == CodeModel::Kernel && Offset > 0) 2995 return true; 2996 2997 return false; 2998} 2999 3000/// isCalleePop - Determines whether the callee is required to pop its 3001/// own arguments. Callee pop is necessary to support tail calls. 3002bool X86::isCalleePop(CallingConv::ID CallingConv, 3003 bool is64Bit, bool IsVarArg, bool TailCallOpt) { 3004 if (IsVarArg) 3005 return false; 3006 3007 switch (CallingConv) { 3008 default: 3009 return false; 3010 case CallingConv::X86_StdCall: 3011 return !is64Bit; 3012 case CallingConv::X86_FastCall: 3013 return !is64Bit; 3014 case CallingConv::X86_ThisCall: 3015 return !is64Bit; 3016 case CallingConv::Fast: 3017 return TailCallOpt; 3018 case CallingConv::GHC: 3019 return TailCallOpt; 3020 } 3021} 3022 3023/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 3024/// specific condition code, returning the condition code and the LHS/RHS of the 3025/// comparison to make. 3026static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 3027 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 3028 if (!isFP) { 3029 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 3030 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 3031 // X > -1 -> X == 0, jump !sign. 3032 RHS = DAG.getConstant(0, RHS.getValueType()); 3033 return X86::COND_NS; 3034 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 3035 // X < 0 -> X == 0, jump on sign. 
3036 return X86::COND_S; 3037 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 3038 // X < 1 -> X <= 0 3039 RHS = DAG.getConstant(0, RHS.getValueType()); 3040 return X86::COND_LE; 3041 } 3042 } 3043 3044 switch (SetCCOpcode) { 3045 default: llvm_unreachable("Invalid integer condition!"); 3046 case ISD::SETEQ: return X86::COND_E; 3047 case ISD::SETGT: return X86::COND_G; 3048 case ISD::SETGE: return X86::COND_GE; 3049 case ISD::SETLT: return X86::COND_L; 3050 case ISD::SETLE: return X86::COND_LE; 3051 case ISD::SETNE: return X86::COND_NE; 3052 case ISD::SETULT: return X86::COND_B; 3053 case ISD::SETUGT: return X86::COND_A; 3054 case ISD::SETULE: return X86::COND_BE; 3055 case ISD::SETUGE: return X86::COND_AE; 3056 } 3057 } 3058 3059 // First determine if it is required or is profitable to flip the operands. 3060 3061 // If LHS is a foldable load, but RHS is not, flip the condition. 3062 if (ISD::isNON_EXTLoad(LHS.getNode()) && 3063 !ISD::isNON_EXTLoad(RHS.getNode())) { 3064 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 3065 std::swap(LHS, RHS); 3066 } 3067 3068 switch (SetCCOpcode) { 3069 default: break; 3070 case ISD::SETOLT: 3071 case ISD::SETOLE: 3072 case ISD::SETUGT: 3073 case ISD::SETUGE: 3074 std::swap(LHS, RHS); 3075 break; 3076 } 3077 3078 // On a floating point condition, the flags are set as follows: 3079 // ZF PF CF op 3080 // 0 | 0 | 0 | X > Y 3081 // 0 | 0 | 1 | X < Y 3082 // 1 | 0 | 0 | X == Y 3083 // 1 | 1 | 1 | unordered 3084 switch (SetCCOpcode) { 3085 default: llvm_unreachable("Condcode should be pre-legalized away"); 3086 case ISD::SETUEQ: 3087 case ISD::SETEQ: return X86::COND_E; 3088 case ISD::SETOLT: // flipped 3089 case ISD::SETOGT: 3090 case ISD::SETGT: return X86::COND_A; 3091 case ISD::SETOLE: // flipped 3092 case ISD::SETOGE: 3093 case ISD::SETGE: return X86::COND_AE; 3094 case ISD::SETUGT: // flipped 3095 case ISD::SETULT: 3096 case ISD::SETLT: return X86::COND_B; 3097 case ISD::SETUGE: // flipped 3098 case ISD::SETULE: 3099 case ISD::SETLE: return X86::COND_BE; 3100 case ISD::SETONE: 3101 case ISD::SETNE: return X86::COND_NE; 3102 case ISD::SETUO: return X86::COND_P; 3103 case ISD::SETO: return X86::COND_NP; 3104 case ISD::SETOEQ: 3105 case ISD::SETUNE: return X86::COND_INVALID; 3106 } 3107} 3108 3109/// hasFPCMov - is there a floating point cmov for the specific X86 condition 3110/// code. Current x86 isa includes the following FP cmov instructions: 3111/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 3112static bool hasFPCMov(unsigned X86CC) { 3113 switch (X86CC) { 3114 default: 3115 return false; 3116 case X86::COND_B: 3117 case X86::COND_BE: 3118 case X86::COND_E: 3119 case X86::COND_P: 3120 case X86::COND_A: 3121 case X86::COND_AE: 3122 case X86::COND_NE: 3123 case X86::COND_NP: 3124 return true; 3125 } 3126} 3127 3128/// isFPImmLegal - Returns true if the target can instruction select the 3129/// specified FP immediate natively. If false, the legalizer will 3130/// materialize the FP immediate as a load from a constant pool. 3131bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 3132 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 3133 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 3134 return true; 3135 } 3136 return false; 3137} 3138 3139/// isUndefOrInRange - Return true if Val is undef or if its value falls within 3140/// the specified range (L, H]. 
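/// For example, isUndefOrInRange(-1, 0, 4) and isUndefOrInRange(2, 0, 4) are
/// true, while isUndefOrInRange(4, 0, 4) is false; the interval is the
/// half-open [Low, Hi).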
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

/// isUndefOrInRange - Return true if every element in Mask, beginning
/// at position Pos and ending at Pos+Size, falls within the specified
/// range [Low, Hi) or is undef.
static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
                             int Pos, int Size, int Low, int Hi) {
  for (int i = Pos, e = Pos+Size; i != e; ++i)
    if (!isUndefOrInRange(Mask[i], Low, Hi))
      return false;
  return true;
}

/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0 || Val == CmpVal)
    return true;
  return false;
}

/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
/// at position Pos and ending at Pos+Size, falls within the specified
/// sequential range [Low, Low+Size) or is undef.
static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
                                       int Pos, int Size, int Low) {
  for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}

/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
/// the second operand.
static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT == MVT::v4f32 || VT == MVT::v4i32)
    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
  if (VT == MVT::v2f64 || VT == MVT::v2i64)
    return (Mask[0] < 2 && Mask[1] < 2);
  return false;
}

bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFDMask(M, N->getValueType(0));
}

/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFHW.
static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Lower quadword copied in order or undef.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Upper quadword shuffled.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
      return false;

  return true;
}

bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFHWMask(M, N->getValueType(0));
}

/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFLW.
static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Upper quadword copied in order.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Lower quadword shuffled.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 4)
      return false;

  return true;
}

bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFLWMask(M, N->getValueType(0));
}

/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PALIGNR.
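/// For example, the v8i16 mask <3, 4, 5, 6, 7, 8, 9, 10> matches (every defined
/// element i equals i+3) and corresponds to a PALIGNR with a byte shift of 6.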
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
                          bool hasSSSE3OrAVX) {
  int i, e = VT.getVectorNumElements();
  if (VT.getSizeInBits() != 128)
    return false;

  // Do not handle v2i64 / v2f64 shuffles with palignr.
  if (e < 4 || !hasSSSE3OrAVX)
    return false;

  for (i = 0; i != e; ++i)
    if (Mask[i] >= 0)
      break;

  // All undef, not a palignr.
  if (i == e)
    return false;

  // Make sure we're shifting in the right direction.
  if (Mask[i] <= i)
    return false;

  int s = Mask[i] - i;

  // Check the rest of the elements to see if they are consecutive.
  for (++i; i != e; ++i) {
    int m = Mask[i];
    if (m >= 0 && m != s+i)
      return false;
  }
  return true;
}

/// isVSHUFPYMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to 256-bit
/// VSHUFPSY.
static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
                          bool HasAVX, bool Commuted = false) {
  int NumElems = VT.getVectorNumElements();

  if (!HasAVX || VT.getSizeInBits() != 256)
    return false;

  if (NumElems != 4 && NumElems != 8)
    return false;

  // VSHUFPSY divides the resulting vector into 4 chunks.
  // The sources are also split into 4 chunks, and each destination
  // chunk must come from a different source chunk.
  //
  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
  //
  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
  //
  // VSHUFPDY divides the resulting vector into 4 chunks.
  // The sources are also split into 4 chunks, and each destination
  // chunk must come from a different source chunk.
  //
  //  SRC1 =>      X3       X2       X1       X0
  //  SRC2 =>      Y3       Y2       Y1       Y0
  //
  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
  //
  unsigned QuarterSize = NumElems/4;
  unsigned HalfSize = QuarterSize*2;
  for (unsigned l = 0; l != 2; ++l) {
    unsigned LaneStart = l*HalfSize;
    for (unsigned s = 0; s != 2; ++s) {
      unsigned QuarterStart = s*QuarterSize;
      unsigned Src = (Commuted) ? (1-s) : s;
      unsigned SrcStart = Src*NumElems + LaneStart;
      for (unsigned i = 0; i != QuarterSize; ++i) {
        int Idx = Mask[i+QuarterStart+LaneStart];
        if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize))
          return false;
        // For VSHUFPSY, the mask of the second half must be the same as the
        // first but with the appropriate offsets. This works in the same way
        // as VPERMILPS works with masks.
        if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0)
          continue;
        if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+HalfSize))
          return false;
      }
    }
  }

  return true;
}

/// getShuffleVSHUFPYImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VSHUFPSY/VSHUFPDY instructions.
static unsigned getShuffleVSHUFPYImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  EVT VT = SVOp->getValueType(0);
  int NumElems = VT.getVectorNumElements();

  assert(VT.getSizeInBits() == 256 && "Only supports 256-bit types");
  assert((NumElems == 4 || NumElems == 8) && "Only supports v4 and v8 types");

  int HalfSize = NumElems/2;
  unsigned Mul = (NumElems == 8) ?
2 : 1; 3347 unsigned Mask = 0; 3348 for (int i = 0; i != NumElems; ++i) { 3349 int Elt = SVOp->getMaskElt(i); 3350 if (Elt < 0) 3351 continue; 3352 Elt %= HalfSize; 3353 unsigned Shamt = i; 3354 // For VSHUFPSY, the mask of the first half must be equal to the second one. 3355 if (NumElems == 8) Shamt %= HalfSize; 3356 Mask |= Elt << (Shamt*Mul); 3357 } 3358 3359 return Mask; 3360} 3361 3362/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3363/// the two vector operands have swapped position. 3364static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, 3365 unsigned NumElems) { 3366 for (unsigned i = 0; i != NumElems; ++i) { 3367 int idx = Mask[i]; 3368 if (idx < 0) 3369 continue; 3370 else if (idx < (int)NumElems) 3371 Mask[i] = idx + NumElems; 3372 else 3373 Mask[i] = idx - NumElems; 3374 } 3375} 3376 3377/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 3378/// specifies a shuffle of elements that is suitable for input to 128-bit 3379/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be 3380/// reverse of what x86 shuffles want. 3381static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT, 3382 bool Commuted = false) { 3383 unsigned NumElems = VT.getVectorNumElements(); 3384 3385 if (VT.getSizeInBits() != 128) 3386 return false; 3387 3388 if (NumElems != 2 && NumElems != 4) 3389 return false; 3390 3391 unsigned Half = NumElems / 2; 3392 unsigned SrcStart = Commuted ? NumElems : 0; 3393 for (unsigned i = 0; i != Half; ++i) 3394 if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) 3395 return false; 3396 SrcStart = Commuted ? 0 : NumElems; 3397 for (unsigned i = Half; i != NumElems; ++i) 3398 if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems)) 3399 return false; 3400 3401 return true; 3402} 3403 3404bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 3405 SmallVector<int, 8> M; 3406 N->getMask(M); 3407 return ::isSHUFPMask(M, N->getValueType(0)); 3408} 3409 3410/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3411/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 3412bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 3413 EVT VT = N->getValueType(0); 3414 unsigned NumElems = VT.getVectorNumElements(); 3415 3416 if (VT.getSizeInBits() != 128) 3417 return false; 3418 3419 if (NumElems != 4) 3420 return false; 3421 3422 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3423 return isUndefOrEqual(N->getMaskElt(0), 6) && 3424 isUndefOrEqual(N->getMaskElt(1), 7) && 3425 isUndefOrEqual(N->getMaskElt(2), 2) && 3426 isUndefOrEqual(N->getMaskElt(3), 3); 3427} 3428 3429/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3430/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3431/// <2, 3, 2, 3> 3432bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 3433 EVT VT = N->getValueType(0); 3434 unsigned NumElems = VT.getVectorNumElements(); 3435 3436 if (VT.getSizeInBits() != 128) 3437 return false; 3438 3439 if (NumElems != 4) 3440 return false; 3441 3442 return isUndefOrEqual(N->getMaskElt(0), 2) && 3443 isUndefOrEqual(N->getMaskElt(1), 3) && 3444 isUndefOrEqual(N->getMaskElt(2), 2) && 3445 isUndefOrEqual(N->getMaskElt(3), 3); 3446} 3447 3448/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3449/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 
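/// For a 4-element vector this corresponds to the mask <4, 5, 2, 3>: the low
/// half is taken from V2 while the high half stays in place from V1.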
3450bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3451 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3452 3453 if (NumElems != 2 && NumElems != 4) 3454 return false; 3455 3456 for (unsigned i = 0; i < NumElems/2; ++i) 3457 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3458 return false; 3459 3460 for (unsigned i = NumElems/2; i < NumElems; ++i) 3461 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3462 return false; 3463 3464 return true; 3465} 3466 3467/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3468/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3469bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3470 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3471 3472 if ((NumElems != 2 && NumElems != 4) 3473 || N->getValueType(0).getSizeInBits() > 128) 3474 return false; 3475 3476 for (unsigned i = 0; i < NumElems/2; ++i) 3477 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3478 return false; 3479 3480 for (unsigned i = 0; i < NumElems/2; ++i) 3481 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3482 return false; 3483 3484 return true; 3485} 3486 3487/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3488/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3489static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3490 bool HasAVX2, bool V2IsSplat = false) { 3491 unsigned NumElts = VT.getVectorNumElements(); 3492 3493 assert((VT.is128BitVector() || VT.is256BitVector()) && 3494 "Unsupported vector type for unpckh"); 3495 3496 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3497 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3498 return false; 3499 3500 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3501 // independently on 128-bit lanes. 3502 unsigned NumLanes = VT.getSizeInBits()/128; 3503 unsigned NumLaneElts = NumElts/NumLanes; 3504 3505 for (unsigned l = 0; l != NumLanes; ++l) { 3506 for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; 3507 i != (l+1)*NumLaneElts; 3508 i += 2, ++j) { 3509 int BitI = Mask[i]; 3510 int BitI1 = Mask[i+1]; 3511 if (!isUndefOrEqual(BitI, j)) 3512 return false; 3513 if (V2IsSplat) { 3514 if (!isUndefOrEqual(BitI1, NumElts)) 3515 return false; 3516 } else { 3517 if (!isUndefOrEqual(BitI1, j + NumElts)) 3518 return false; 3519 } 3520 } 3521 } 3522 3523 return true; 3524} 3525 3526bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) { 3527 SmallVector<int, 8> M; 3528 N->getMask(M); 3529 return ::isUNPCKLMask(M, N->getValueType(0), HasAVX2, V2IsSplat); 3530} 3531 3532/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3533/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3534static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3535 bool HasAVX2, bool V2IsSplat = false) { 3536 unsigned NumElts = VT.getVectorNumElements(); 3537 3538 assert((VT.is128BitVector() || VT.is256BitVector()) && 3539 "Unsupported vector type for unpckh"); 3540 3541 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3542 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3543 return false; 3544 3545 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3546 // independently on 128-bit lanes. 
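// For example, when V2 is not a splat, a v8i32 UNPCKH mask looks like
// <2, 10, 3, 11, 6, 14, 7, 15>.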
3547 unsigned NumLanes = VT.getSizeInBits()/128; 3548 unsigned NumLaneElts = NumElts/NumLanes; 3549 3550 for (unsigned l = 0; l != NumLanes; ++l) { 3551 for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; 3552 i != (l+1)*NumLaneElts; i += 2, ++j) { 3553 int BitI = Mask[i]; 3554 int BitI1 = Mask[i+1]; 3555 if (!isUndefOrEqual(BitI, j)) 3556 return false; 3557 if (V2IsSplat) { 3558 if (isUndefOrEqual(BitI1, NumElts)) 3559 return false; 3560 } else { 3561 if (!isUndefOrEqual(BitI1, j+NumElts)) 3562 return false; 3563 } 3564 } 3565 } 3566 return true; 3567} 3568 3569bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) { 3570 SmallVector<int, 8> M; 3571 N->getMask(M); 3572 return ::isUNPCKHMask(M, N->getValueType(0), HasAVX2, V2IsSplat); 3573} 3574 3575/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3576/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3577/// <0, 0, 1, 1> 3578static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT, 3579 bool HasAVX2) { 3580 unsigned NumElts = VT.getVectorNumElements(); 3581 3582 assert((VT.is128BitVector() || VT.is256BitVector()) && 3583 "Unsupported vector type for unpckh"); 3584 3585 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3586 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3587 return false; 3588 3589 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern 3590 // FIXME: Need a better way to get rid of this, there's no latency difference 3591 // between UNPCKLPD and MOVDDUP, the later should always be checked first and 3592 // the former later. We should also remove the "_undef" special mask. 3593 if (NumElts == 4 && VT.getSizeInBits() == 256) 3594 return false; 3595 3596 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3597 // independently on 128-bit lanes. 3598 unsigned NumLanes = VT.getSizeInBits()/128; 3599 unsigned NumLaneElts = NumElts/NumLanes; 3600 3601 for (unsigned l = 0; l != NumLanes; ++l) { 3602 for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; 3603 i != (l+1)*NumLaneElts; 3604 i += 2, ++j) { 3605 int BitI = Mask[i]; 3606 int BitI1 = Mask[i+1]; 3607 3608 if (!isUndefOrEqual(BitI, j)) 3609 return false; 3610 if (!isUndefOrEqual(BitI1, j)) 3611 return false; 3612 } 3613 } 3614 3615 return true; 3616} 3617 3618bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N, bool HasAVX2) { 3619 SmallVector<int, 8> M; 3620 N->getMask(M); 3621 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0), HasAVX2); 3622} 3623 3624/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3625/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3626/// <2, 2, 3, 3> 3627static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT, 3628 bool HasAVX2) { 3629 unsigned NumElts = VT.getVectorNumElements(); 3630 3631 assert((VT.is128BitVector() || VT.is256BitVector()) && 3632 "Unsupported vector type for unpckh"); 3633 3634 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && 3635 (!HasAVX2 || (NumElts != 16 && NumElts != 32))) 3636 return false; 3637 3638 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3639 // independently on 128-bit lanes. 
3640 unsigned NumLanes = VT.getSizeInBits()/128; 3641 unsigned NumLaneElts = NumElts/NumLanes; 3642 3643 for (unsigned l = 0; l != NumLanes; ++l) { 3644 for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; 3645 i != (l+1)*NumLaneElts; i += 2, ++j) { 3646 int BitI = Mask[i]; 3647 int BitI1 = Mask[i+1]; 3648 if (!isUndefOrEqual(BitI, j)) 3649 return false; 3650 if (!isUndefOrEqual(BitI1, j)) 3651 return false; 3652 } 3653 } 3654 return true; 3655} 3656 3657bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N, bool HasAVX2) { 3658 SmallVector<int, 8> M; 3659 N->getMask(M); 3660 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0), HasAVX2); 3661} 3662 3663/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3664/// specifies a shuffle of elements that is suitable for input to MOVSS, 3665/// MOVSD, and MOVD, i.e. setting the lowest element. 3666static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3667 if (VT.getVectorElementType().getSizeInBits() < 32) 3668 return false; 3669 3670 int NumElts = VT.getVectorNumElements(); 3671 3672 if (!isUndefOrEqual(Mask[0], NumElts)) 3673 return false; 3674 3675 for (int i = 1; i < NumElts; ++i) 3676 if (!isUndefOrEqual(Mask[i], i)) 3677 return false; 3678 3679 return true; 3680} 3681 3682bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3683 SmallVector<int, 8> M; 3684 N->getMask(M); 3685 return ::isMOVLMask(M, N->getValueType(0)); 3686} 3687 3688/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered 3689/// as permutations between 128-bit chunks or halves. As an example: this 3690/// shuffle bellow: 3691/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> 3692/// The first half comes from the second half of V1 and the second half from the 3693/// the second half of V2. 3694static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT, 3695 bool HasAVX) { 3696 if (!HasAVX || VT.getSizeInBits() != 256) 3697 return false; 3698 3699 // The shuffle result is divided into half A and half B. In total the two 3700 // sources have 4 halves, namely: C, D, E, F. The final values of A and 3701 // B must come from C, D, E or F. 3702 int HalfSize = VT.getVectorNumElements()/2; 3703 bool MatchA = false, MatchB = false; 3704 3705 // Check if A comes from one of C, D, E, F. 3706 for (int Half = 0; Half < 4; ++Half) { 3707 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { 3708 MatchA = true; 3709 break; 3710 } 3711 } 3712 3713 // Check if B comes from one of C, D, E, F. 3714 for (int Half = 0; Half < 4; ++Half) { 3715 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { 3716 MatchB = true; 3717 break; 3718 } 3719 } 3720 3721 return MatchA && MatchB; 3722} 3723 3724/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle 3725/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. 
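/// For the v8i32 example mask above, <4, 5, 6, 7, 12, 13, 14, 15>, both halves
/// of the result select the high half of their source, so FstHalf is 1,
/// SndHalf is 3, and the returned immediate is 0x31.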
static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
  EVT VT = SVOp->getValueType(0);

  int HalfSize = VT.getVectorNumElements()/2;

  int FstHalf = 0, SndHalf = 0;
  for (int i = 0; i < HalfSize; ++i) {
    if (SVOp->getMaskElt(i) > 0) {
      FstHalf = SVOp->getMaskElt(i)/HalfSize;
      break;
    }
  }
  for (int i = HalfSize; i < HalfSize*2; ++i) {
    if (SVOp->getMaskElt(i) > 0) {
      SndHalf = SVOp->getMaskElt(i)/HalfSize;
      break;
    }
  }

  return (FstHalf | (SndHalf << 4));
}

/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
/// Note that VPERMIL mask matching differs depending on whether the underlying
/// type is 32 or 64 bits wide. For VPERMILPS the high half of the mask must
/// select the same elements as the low half, but from the high half of the
/// source. For VPERMILPD the two lanes can be shuffled independently of each
/// other, with the restriction that lanes can't be crossed.
static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT,
                           bool HasAVX) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits()/128;

  if (!HasAVX)
    return false;

  // Only match 256-bit with 32/64-bit types
  if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
    return false;

  int LaneSize = NumElts/NumLanes;
  for (int l = 0; l != NumLanes; ++l) {
    int LaneStart = l*LaneSize;
    for (int i = 0; i != LaneSize; ++i) {
      if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize))
        return false;
      if (NumElts == 4 || l == 0)
        continue;
      // VPERMILPS handling
      if (Mask[i] < 0)
        continue;
      if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneSize))
        return false;
    }
  }

  return true;
}

/// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VPERMILPS/D* instructions.
static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) {
  EVT VT = SVOp->getValueType(0);

  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits()/128;
  int LaneSize = NumElts/NumLanes;

  // Although the mask is equal for both lanes, do it twice to handle the cases
  // where a mask matches because the same mask element is undef on the first
  // half but defined on the second. This catches pathological cases such as:
  // shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid.
  unsigned Shift = (LaneSize == 4) ? 2 : 1;
  unsigned Mask = 0;
  for (int i = 0; i != NumElts; ++i) {
    int MaskElt = SVOp->getMaskElt(i);
    if (MaskElt < 0)
      continue;
    MaskElt %= LaneSize;
    unsigned Shamt = i;
    // For VPERMILPSY, the mask of the first half must be equal to the second.
    if (NumElts == 8) Shamt %= LaneSize;
    Mask |= MaskElt << (Shamt*Shift);
  }

  return Mask;
}

/// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
/// x86 movss wants: movl requires the lowest element to be the lowest element
/// of vector 2 and the remaining elements to come from vector 1 in order.
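/// For a v4i32 shuffle the commuted mask is therefore <0, 5, 6, 7>: the lowest
/// element comes from vector 1 and the remaining elements from vector 2.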
3818static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3819 bool V2IsSplat = false, bool V2IsUndef = false) { 3820 int NumOps = VT.getVectorNumElements(); 3821 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3822 return false; 3823 3824 if (!isUndefOrEqual(Mask[0], 0)) 3825 return false; 3826 3827 for (int i = 1; i < NumOps; ++i) 3828 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3829 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3830 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3831 return false; 3832 3833 return true; 3834} 3835 3836static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3837 bool V2IsUndef = false) { 3838 SmallVector<int, 8> M; 3839 N->getMask(M); 3840 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3841} 3842 3843/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3844/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3845/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 3846bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, 3847 const X86Subtarget *Subtarget) { 3848 if (!Subtarget->hasSSE3orAVX()) 3849 return false; 3850 3851 // The second vector must be undef 3852 if (N->getOperand(1).getOpcode() != ISD::UNDEF) 3853 return false; 3854 3855 EVT VT = N->getValueType(0); 3856 unsigned NumElems = VT.getVectorNumElements(); 3857 3858 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3859 (VT.getSizeInBits() == 256 && NumElems != 8)) 3860 return false; 3861 3862 // "i+1" is the value the indexed mask element must have 3863 for (unsigned i = 0; i < NumElems; i += 2) 3864 if (!isUndefOrEqual(N->getMaskElt(i), i+1) || 3865 !isUndefOrEqual(N->getMaskElt(i+1), i+1)) 3866 return false; 3867 3868 return true; 3869} 3870 3871/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3872/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3873/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 3874bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, 3875 const X86Subtarget *Subtarget) { 3876 if (!Subtarget->hasSSE3orAVX()) 3877 return false; 3878 3879 // The second vector must be undef 3880 if (N->getOperand(1).getOpcode() != ISD::UNDEF) 3881 return false; 3882 3883 EVT VT = N->getValueType(0); 3884 unsigned NumElems = VT.getVectorNumElements(); 3885 3886 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3887 (VT.getSizeInBits() == 256 && NumElems != 8)) 3888 return false; 3889 3890 // "i" is the value the indexed mask element must have 3891 for (unsigned i = 0; i < NumElems; i += 2) 3892 if (!isUndefOrEqual(N->getMaskElt(i), i) || 3893 !isUndefOrEqual(N->getMaskElt(i+1), i)) 3894 return false; 3895 3896 return true; 3897} 3898 3899/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand 3900/// specifies a shuffle of elements that is suitable for input to 256-bit 3901/// version of MOVDDUP. 
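/// For example, for v4f64 the expected mask is <0, 0, 2, 2>: each 128-bit lane
/// duplicates its own low element.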
3902static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT, 3903 bool HasAVX) { 3904 int NumElts = VT.getVectorNumElements(); 3905 3906 if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4) 3907 return false; 3908 3909 for (int i = 0; i != NumElts/2; ++i) 3910 if (!isUndefOrEqual(Mask[i], 0)) 3911 return false; 3912 for (int i = NumElts/2; i != NumElts; ++i) 3913 if (!isUndefOrEqual(Mask[i], NumElts/2)) 3914 return false; 3915 return true; 3916} 3917 3918/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3919/// specifies a shuffle of elements that is suitable for input to 128-bit 3920/// version of MOVDDUP. 3921bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3922 EVT VT = N->getValueType(0); 3923 3924 if (VT.getSizeInBits() != 128) 3925 return false; 3926 3927 int e = VT.getVectorNumElements() / 2; 3928 for (int i = 0; i < e; ++i) 3929 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3930 return false; 3931 for (int i = 0; i < e; ++i) 3932 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3933 return false; 3934 return true; 3935} 3936 3937/// isVEXTRACTF128Index - Return true if the specified 3938/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3939/// suitable for input to VEXTRACTF128. 3940bool X86::isVEXTRACTF128Index(SDNode *N) { 3941 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3942 return false; 3943 3944 // The index should be aligned on a 128-bit boundary. 3945 uint64_t Index = 3946 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3947 3948 unsigned VL = N->getValueType(0).getVectorNumElements(); 3949 unsigned VBits = N->getValueType(0).getSizeInBits(); 3950 unsigned ElSize = VBits / VL; 3951 bool Result = (Index * ElSize) % 128 == 0; 3952 3953 return Result; 3954} 3955 3956/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 3957/// operand specifies a subvector insert that is suitable for input to 3958/// VINSERTF128. 3959bool X86::isVINSERTF128Index(SDNode *N) { 3960 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3961 return false; 3962 3963 // The index should be aligned on a 128-bit boundary. 3964 uint64_t Index = 3965 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3966 3967 unsigned VL = N->getValueType(0).getVectorNumElements(); 3968 unsigned VBits = N->getValueType(0).getSizeInBits(); 3969 unsigned ElSize = VBits / VL; 3970 bool Result = (Index * ElSize) % 128 == 0; 3971 3972 return Result; 3973} 3974 3975/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3976/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3977unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3978 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3979 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3980 3981 unsigned Shift = (NumOperands == 4) ? 2 : 1; 3982 unsigned Mask = 0; 3983 for (int i = 0; i < NumOperands; ++i) { 3984 int Val = SVOp->getMaskElt(NumOperands-i-1); 3985 if (Val < 0) Val = 0; 3986 if (Val >= NumOperands) Val -= NumOperands; 3987 Mask |= Val; 3988 if (i != NumOperands - 1) 3989 Mask <<= Shift; 3990 } 3991 return Mask; 3992} 3993 3994/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3995/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3996unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3997 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3998 unsigned Mask = 0; 3999 // 8 nodes, but we only care about the last 4. 
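// For example, the full mask <0, 1, 2, 3, 7, 6, 5, 4> (upper quadword
// reversed) produces the PSHUFHW immediate 0x1B.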
4000 for (unsigned i = 7; i >= 4; --i) { 4001 int Val = SVOp->getMaskElt(i); 4002 if (Val >= 0) 4003 Mask |= (Val - 4); 4004 if (i != 4) 4005 Mask <<= 2; 4006 } 4007 return Mask; 4008} 4009 4010/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 4011/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 4012unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 4013 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 4014 unsigned Mask = 0; 4015 // 8 nodes, but we only care about the first 4. 4016 for (int i = 3; i >= 0; --i) { 4017 int Val = SVOp->getMaskElt(i); 4018 if (Val >= 0) 4019 Mask |= Val; 4020 if (i != 0) 4021 Mask <<= 2; 4022 } 4023 return Mask; 4024} 4025 4026/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 4027/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 4028static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { 4029 EVT VT = SVOp->getValueType(0); 4030 unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; 4031 int Val = 0; 4032 4033 unsigned i, e; 4034 for (i = 0, e = VT.getVectorNumElements(); i != e; ++i) { 4035 Val = SVOp->getMaskElt(i); 4036 if (Val >= 0) 4037 break; 4038 } 4039 assert(Val - i > 0 && "PALIGNR imm should be positive"); 4040 return (Val - i) * EltSize; 4041} 4042 4043/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 4044/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 4045/// instructions. 4046unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 4047 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 4048 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 4049 4050 uint64_t Index = 4051 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 4052 4053 EVT VecVT = N->getOperand(0).getValueType(); 4054 EVT ElVT = VecVT.getVectorElementType(); 4055 4056 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4057 return Index / NumElemsPerChunk; 4058} 4059 4060/// getInsertVINSERTF128Immediate - Return the appropriate immediate 4061/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 4062/// instructions. 4063unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 4064 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 4065 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 4066 4067 uint64_t Index = 4068 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 4069 4070 EVT VecVT = N->getValueType(0); 4071 EVT ElVT = VecVT.getVectorElementType(); 4072 4073 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 4074 return Index / NumElemsPerChunk; 4075} 4076 4077/// isZeroNode - Returns true if Elt is a constant zero or a floating point 4078/// constant +0.0. 4079bool X86::isZeroNode(SDValue Elt) { 4080 return ((isa<ConstantSDNode>(Elt) && 4081 cast<ConstantSDNode>(Elt)->isNullValue()) || 4082 (isa<ConstantFPSDNode>(Elt) && 4083 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 4084} 4085 4086/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 4087/// their permute mask. 
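/// For example, shuffle(V1, V2, <0, 5, 2, 7>) becomes
/// shuffle(V2, V1, <4, 1, 6, 3>).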
4088static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 4089 SelectionDAG &DAG) { 4090 EVT VT = SVOp->getValueType(0); 4091 unsigned NumElems = VT.getVectorNumElements(); 4092 SmallVector<int, 8> MaskVec; 4093 4094 for (unsigned i = 0; i != NumElems; ++i) { 4095 int idx = SVOp->getMaskElt(i); 4096 if (idx < 0) 4097 MaskVec.push_back(idx); 4098 else if (idx < (int)NumElems) 4099 MaskVec.push_back(idx + NumElems); 4100 else 4101 MaskVec.push_back(idx - NumElems); 4102 } 4103 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 4104 SVOp->getOperand(0), &MaskVec[0]); 4105} 4106 4107/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 4108/// match movhlps. The lower half elements should come from upper half of 4109/// V1 (and in order), and the upper half elements should come from the upper 4110/// half of V2 (and in order). 4111static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 4112 EVT VT = Op->getValueType(0); 4113 if (VT.getSizeInBits() != 128) 4114 return false; 4115 if (VT.getVectorNumElements() != 4) 4116 return false; 4117 for (unsigned i = 0, e = 2; i != e; ++i) 4118 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 4119 return false; 4120 for (unsigned i = 2; i != 4; ++i) 4121 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 4122 return false; 4123 return true; 4124} 4125 4126/// isScalarLoadToVector - Returns true if the node is a scalar load that 4127/// is promoted to a vector. It also returns the LoadSDNode by reference if 4128/// required. 4129static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 4130 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 4131 return false; 4132 N = N->getOperand(0).getNode(); 4133 if (!ISD::isNON_EXTLoad(N)) 4134 return false; 4135 if (LD) 4136 *LD = cast<LoadSDNode>(N); 4137 return true; 4138} 4139 4140// Test whether the given value is a vector value which will be legalized 4141// into a load. 4142static bool WillBeConstantPoolLoad(SDNode *N) { 4143 if (N->getOpcode() != ISD::BUILD_VECTOR) 4144 return false; 4145 4146 // Check for any non-constant elements. 4147 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) 4148 switch (N->getOperand(i).getNode()->getOpcode()) { 4149 case ISD::UNDEF: 4150 case ISD::ConstantFP: 4151 case ISD::Constant: 4152 break; 4153 default: 4154 return false; 4155 } 4156 4157 // Vectors of all-zeros and all-ones are materialized with special 4158 // instructions rather than being loaded. 4159 return !ISD::isBuildVectorAllZeros(N) && 4160 !ISD::isBuildVectorAllOnes(N); 4161} 4162 4163/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 4164/// match movlp{s|d}. The lower half elements should come from lower half of 4165/// V1 (and in order), and the upper half elements should come from the upper 4166/// half of V2 (and in order). And since V1 will become the source of the 4167/// MOVLP, it must be either a vector load or a scalar load to vector. 4168static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 4169 ShuffleVectorSDNode *Op) { 4170 EVT VT = Op->getValueType(0); 4171 if (VT.getSizeInBits() != 128) 4172 return false; 4173 4174 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 4175 return false; 4176 // Is V2 is a vector load, don't do this transformation. We will try to use 4177 // load folding shufps op. 
4178 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) 4179 return false; 4180 4181 unsigned NumElems = VT.getVectorNumElements(); 4182 4183 if (NumElems != 2 && NumElems != 4) 4184 return false; 4185 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 4186 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 4187 return false; 4188 for (unsigned i = NumElems/2; i != NumElems; ++i) 4189 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 4190 return false; 4191 return true; 4192} 4193 4194/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 4195/// all the same. 4196static bool isSplatVector(SDNode *N) { 4197 if (N->getOpcode() != ISD::BUILD_VECTOR) 4198 return false; 4199 4200 SDValue SplatValue = N->getOperand(0); 4201 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 4202 if (N->getOperand(i) != SplatValue) 4203 return false; 4204 return true; 4205} 4206 4207/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 4208/// to an zero vector. 4209/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 4210static bool isZeroShuffle(ShuffleVectorSDNode *N) { 4211 SDValue V1 = N->getOperand(0); 4212 SDValue V2 = N->getOperand(1); 4213 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 4214 for (unsigned i = 0; i != NumElems; ++i) { 4215 int Idx = N->getMaskElt(i); 4216 if (Idx >= (int)NumElems) { 4217 unsigned Opc = V2.getOpcode(); 4218 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 4219 continue; 4220 if (Opc != ISD::BUILD_VECTOR || 4221 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 4222 return false; 4223 } else if (Idx >= 0) { 4224 unsigned Opc = V1.getOpcode(); 4225 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 4226 continue; 4227 if (Opc != ISD::BUILD_VECTOR || 4228 !X86::isZeroNode(V1.getOperand(Idx))) 4229 return false; 4230 } 4231 } 4232 return true; 4233} 4234 4235/// getZeroVector - Returns a vector of specified type with all zero elements. 4236/// 4237static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG, 4238 DebugLoc dl) { 4239 assert(VT.isVector() && "Expected a vector type"); 4240 4241 // Always build SSE zero vectors as <4 x i32> bitcasted 4242 // to their dest type. This ensures they get CSE'd. 4243 SDValue Vec; 4244 if (VT.getSizeInBits() == 128) { // SSE 4245 if (HasXMMInt) { // SSE2 4246 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 4247 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4248 } else { // SSE1 4249 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4250 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 4251 } 4252 } else if (VT.getSizeInBits() == 256) { // AVX 4253 // 256-bit logic and arithmetic instructions in AVX are 4254 // all floating-point, no support for integer ops. Default 4255 // to emitting fp zeroed vectors then. 4256 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 4257 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4258 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 4259 } 4260 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4261} 4262 4263/// getOnesVector - Returns a vector of specified type with all bits set. 4264/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with 4265/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. 4266/// Then bitcast to their original type, ensuring they get CSE'd. 
4267static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, 4268 DebugLoc dl) { 4269 assert(VT.isVector() && "Expected a vector type"); 4270 assert((VT.is128BitVector() || VT.is256BitVector()) 4271 && "Expected a 128-bit or 256-bit vector type"); 4272 4273 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 4274 SDValue Vec; 4275 if (VT.getSizeInBits() == 256) { 4276 if (HasAVX2) { // AVX2 4277 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 4278 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); 4279 } else { // AVX 4280 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4281 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), 4282 Vec, DAG.getConstant(0, MVT::i32), DAG, dl); 4283 Vec = Insert128BitVector(InsV, Vec, 4284 DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); 4285 } 4286 } else { 4287 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 4288 } 4289 4290 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 4291} 4292 4293/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 4294/// that point to V2 points to its first element. 4295static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4296 EVT VT = SVOp->getValueType(0); 4297 unsigned NumElems = VT.getVectorNumElements(); 4298 4299 bool Changed = false; 4300 SmallVector<int, 8> MaskVec; 4301 SVOp->getMask(MaskVec); 4302 4303 for (unsigned i = 0; i != NumElems; ++i) { 4304 if (MaskVec[i] > (int)NumElems) { 4305 MaskVec[i] = NumElems; 4306 Changed = true; 4307 } 4308 } 4309 if (Changed) 4310 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 4311 SVOp->getOperand(1), &MaskVec[0]); 4312 return SDValue(SVOp, 0); 4313} 4314 4315/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 4316/// operation of specified width. 4317static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4318 SDValue V2) { 4319 unsigned NumElems = VT.getVectorNumElements(); 4320 SmallVector<int, 8> Mask; 4321 Mask.push_back(NumElems); 4322 for (unsigned i = 1; i != NumElems; ++i) 4323 Mask.push_back(i); 4324 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4325} 4326 4327/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 4328static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4329 SDValue V2) { 4330 unsigned NumElems = VT.getVectorNumElements(); 4331 SmallVector<int, 8> Mask; 4332 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 4333 Mask.push_back(i); 4334 Mask.push_back(i + NumElems); 4335 } 4336 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4337} 4338 4339/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 4340static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4341 SDValue V2) { 4342 unsigned NumElems = VT.getVectorNumElements(); 4343 unsigned Half = NumElems/2; 4344 SmallVector<int, 8> Mask; 4345 for (unsigned i = 0; i != Half; ++i) { 4346 Mask.push_back(i + Half); 4347 Mask.push_back(i + NumElems + Half); 4348 } 4349 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4350} 4351 4352// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by 4353// a generic shuffle instruction because the target has no such instructions. 4354// Generate shuffles which repeat i16 and i8 several times until they can be 4355// represented by v4f32 and then be manipulated by target suported shuffles. 
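// For example, to splat element 5 of a v8i16, a single unpckh step yields
// <e4, e4, e5, e5, e6, e6, e7, e7> with EltNo adjusted to 1; viewed as a
// v4f32, element 1 is then the (e5, e5) pair, which getLegalSplat broadcasts.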
4356static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { 4357 EVT VT = V.getValueType(); 4358 int NumElems = VT.getVectorNumElements(); 4359 DebugLoc dl = V.getDebugLoc(); 4360 4361 while (NumElems > 4) { 4362 if (EltNo < NumElems/2) { 4363 V = getUnpackl(DAG, dl, VT, V, V); 4364 } else { 4365 V = getUnpackh(DAG, dl, VT, V, V); 4366 EltNo -= NumElems/2; 4367 } 4368 NumElems >>= 1; 4369 } 4370 return V; 4371} 4372 4373/// getLegalSplat - Generate a legal splat with supported x86 shuffles 4374static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { 4375 EVT VT = V.getValueType(); 4376 DebugLoc dl = V.getDebugLoc(); 4377 assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) 4378 && "Vector size not supported"); 4379 4380 if (VT.getSizeInBits() == 128) { 4381 V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); 4382 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 4383 V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), 4384 &SplatMask[0]); 4385 } else { 4386 // To use VPERMILPS to splat scalars, the second half of indicies must 4387 // refer to the higher part, which is a duplication of the lower one, 4388 // because VPERMILPS can only handle in-lane permutations. 4389 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, 4390 EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; 4391 4392 V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); 4393 V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), 4394 &SplatMask[0]); 4395 } 4396 4397 return DAG.getNode(ISD::BITCAST, dl, VT, V); 4398} 4399 4400/// PromoteSplat - Splat is promoted to target supported vector shuffles. 4401static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 4402 EVT SrcVT = SV->getValueType(0); 4403 SDValue V1 = SV->getOperand(0); 4404 DebugLoc dl = SV->getDebugLoc(); 4405 4406 int EltNo = SV->getSplatIndex(); 4407 int NumElems = SrcVT.getVectorNumElements(); 4408 unsigned Size = SrcVT.getSizeInBits(); 4409 4410 assert(((Size == 128 && NumElems > 4) || Size == 256) && 4411 "Unknown how to promote splat for type"); 4412 4413 // Extract the 128-bit part containing the splat element and update 4414 // the splat element index when it refers to the higher register. 4415 if (Size == 256) { 4416 unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0; 4417 V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); 4418 if (Idx > 0) 4419 EltNo -= NumElems/2; 4420 } 4421 4422 // All i16 and i8 vector types can't be used directly by a generic shuffle 4423 // instruction because the target has no such instruction. Generate shuffles 4424 // which repeat i16 and i8 several times until they fit in i32, and then can 4425 // be manipulated by target suported shuffles. 4426 EVT EltVT = SrcVT.getVectorElementType(); 4427 if (EltVT == MVT::i8 || EltVT == MVT::i16) 4428 V1 = PromoteSplati8i16(V1, DAG, EltNo); 4429 4430 // Recreate the 256-bit vector and place the same 128-bit vector 4431 // into the low and high part. This is necessary because we want 4432 // to use VPERM* to shuffle the vectors 4433 if (Size == 256) { 4434 SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1, 4435 DAG.getConstant(0, MVT::i32), DAG, dl); 4436 V1 = Insert128BitVector(InsV, V1, 4437 DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 4438 } 4439 4440 return getLegalSplat(DAG, V1, EltNo); 4441} 4442 4443/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4444/// vector of zero or undef vector. 
This produces a shuffle where the low 4445/// element of V2 is swizzled into the zero/undef vector, landing at element 4446/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 4447static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4448 bool isZero, bool HasXMMInt, 4449 SelectionDAG &DAG) { 4450 EVT VT = V2.getValueType(); 4451 SDValue V1 = isZero 4452 ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 4453 unsigned NumElems = VT.getVectorNumElements(); 4454 SmallVector<int, 16> MaskVec; 4455 for (unsigned i = 0; i != NumElems; ++i) 4456 // If this is the insertion idx, put the low elt of V2 here. 4457 MaskVec.push_back(i == Idx ? NumElems : i); 4458 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 4459} 4460 4461/// getShuffleScalarElt - Returns the scalar element that will make up the ith 4462/// element of the result of the vector shuffle. 4463static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 4464 unsigned Depth) { 4465 if (Depth == 6) 4466 return SDValue(); // Limit search depth. 4467 4468 SDValue V = SDValue(N, 0); 4469 EVT VT = V.getValueType(); 4470 unsigned Opcode = V.getOpcode(); 4471 4472 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 4473 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 4474 Index = SV->getMaskElt(Index); 4475 4476 if (Index < 0) 4477 return DAG.getUNDEF(VT.getVectorElementType()); 4478 4479 int NumElems = VT.getVectorNumElements(); 4480 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 4481 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 4482 } 4483 4484 // Recurse into target specific vector shuffles to find scalars. 4485 if (isTargetShuffle(Opcode)) { 4486 int NumElems = VT.getVectorNumElements(); 4487 SmallVector<unsigned, 16> ShuffleMask; 4488 SDValue ImmN; 4489 4490 switch(Opcode) { 4491 case X86ISD::SHUFPS: 4492 case X86ISD::SHUFPD: 4493 ImmN = N->getOperand(N->getNumOperands()-1); 4494 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), 4495 ShuffleMask); 4496 break; 4497 case X86ISD::UNPCKH: 4498 DecodeUNPCKHMask(VT, ShuffleMask); 4499 break; 4500 case X86ISD::UNPCKL: 4501 DecodeUNPCKLMask(VT, ShuffleMask); 4502 break; 4503 case X86ISD::MOVHLPS: 4504 DecodeMOVHLPSMask(NumElems, ShuffleMask); 4505 break; 4506 case X86ISD::MOVLHPS: 4507 DecodeMOVLHPSMask(NumElems, ShuffleMask); 4508 break; 4509 case X86ISD::PSHUFD: 4510 ImmN = N->getOperand(N->getNumOperands()-1); 4511 DecodePSHUFMask(NumElems, 4512 cast<ConstantSDNode>(ImmN)->getZExtValue(), 4513 ShuffleMask); 4514 break; 4515 case X86ISD::PSHUFHW: 4516 ImmN = N->getOperand(N->getNumOperands()-1); 4517 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 4518 ShuffleMask); 4519 break; 4520 case X86ISD::PSHUFLW: 4521 ImmN = N->getOperand(N->getNumOperands()-1); 4522 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 4523 ShuffleMask); 4524 break; 4525 case X86ISD::MOVSS: 4526 case X86ISD::MOVSD: { 4527 // The index 0 always comes from the first element of the second source, 4528 // this is why MOVSS and MOVSD are used in the first place. The other 4529 // elements come from the other positions of the first source vector. 4530 unsigned OpNum = (Index == 0) ? 
1 : 0;
4531    return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
4532                               Depth+1);
4533  }
4534  case X86ISD::VPERMILP:
4535    ImmN = N->getOperand(N->getNumOperands()-1);
4536    DecodeVPERMILPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
4537                       ShuffleMask);
4538    break;
4539  case X86ISD::VPERM2X128:
4540    ImmN = N->getOperand(N->getNumOperands()-1);
4541    DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
4542                         ShuffleMask);
4543    break;
4544  case X86ISD::MOVDDUP:
4545  case X86ISD::MOVLHPD:
4546  case X86ISD::MOVLPD:
4547  case X86ISD::MOVLPS:
4548  case X86ISD::MOVSHDUP:
4549  case X86ISD::MOVSLDUP:
4550  case X86ISD::PALIGN:
4551    return SDValue(); // Not yet implemented.
4552  default:
4553    assert(0 && "unknown target shuffle node");
4554    return SDValue();
4555  }
4556
4557  Index = ShuffleMask[Index];
4558  if (Index < 0)
4559    return DAG.getUNDEF(VT.getVectorElementType());
4560
4561  SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
4562  return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
4563                             Depth+1);
4564  }
4565
4566  // Actual nodes that may contain scalar elements
4567  if (Opcode == ISD::BITCAST) {
4568    V = V.getOperand(0);
4569    EVT SrcVT = V.getValueType();
4570    unsigned NumElems = VT.getVectorNumElements();
4571
4572    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
4573      return SDValue();
4574  }
4575
4576  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
4577    return (Index == 0) ? V.getOperand(0)
4578                        : DAG.getUNDEF(VT.getVectorElementType());
4579
4580  if (V.getOpcode() == ISD::BUILD_VECTOR)
4581    return V.getOperand(Index);
4582
4583  return SDValue();
4584}
4585
4586/// getNumOfConsecutiveZeros - Return the number of elements of a vector
4587/// shuffle operation which are consecutively zero (or undef). The
4588/// search can start in two different directions, from left or right.
4589static
4590unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
4591                                  bool ZerosFromLeft, SelectionDAG &DAG) {
4592  int i = 0;
4593
4594  while (i < NumElems) {
4595    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
4596    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
4597    if (!(Elt.getNode() &&
4598         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
4599      break;
4600    ++i;
4601  }
4602
4603  return i;
4604}
4605
4606/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
4607/// MaskE correspond consecutively to elements from one of the vector operands,
4608/// starting from its index OpIdx. OpNum is set to indicate which source vector
4609/// operand was used.
4609static
4610bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
4611                              int OpIdx, int NumElems, unsigned &OpNum) {
4612  bool SeenV1 = false;
4613  bool SeenV2 = false;
4614
4615  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
4616    int Idx = SVOp->getMaskElt(i);
4617    // Ignore undef indices
4618    if (Idx < 0)
4619      continue;
4620
4621    if (Idx < NumElems)
4622      SeenV1 = true;
4623    else
4624      SeenV2 = true;
4625
4626    // Only accept consecutive elements from the same vector
4627    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
4628      return false;
4629  }
4630
4631  OpNum = SeenV1 ? 0 : 1;
4632  return true;
4633}
4634
4635/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
4636/// logical right shift of a vector.
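///
/// For example, a v4i32 shuffle whose mask is <1, 2, 3, Z>, where Z resolves
/// to a zero element, is reported here as a right shift with ShAmt == 1: the
/// whole value moves toward element 0 and zeros fill in from the top. (An
/// illustrative sketch; the checks below are the precise condition.)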
4637static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4638 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4639 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4640 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4641 false /* check zeros from right */, DAG); 4642 unsigned OpSrc; 4643 4644 if (!NumZeros) 4645 return false; 4646 4647 // Considering the elements in the mask that are not consecutive zeros, 4648 // check if they consecutively come from only one of the source vectors. 4649 // 4650 // V1 = {X, A, B, C} 0 4651 // \ \ \ / 4652 // vector_shuffle V1, V2 <1, 2, 3, X> 4653 // 4654 if (!isShuffleMaskConsecutive(SVOp, 4655 0, // Mask Start Index 4656 NumElems-NumZeros-1, // Mask End Index 4657 NumZeros, // Where to start looking in the src vector 4658 NumElems, // Number of elements in vector 4659 OpSrc)) // Which source operand ? 4660 return false; 4661 4662 isLeft = false; 4663 ShAmt = NumZeros; 4664 ShVal = SVOp->getOperand(OpSrc); 4665 return true; 4666} 4667 4668/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 4669/// logical left shift of a vector. 4670static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4671 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4672 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4673 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4674 true /* check zeros from left */, DAG); 4675 unsigned OpSrc; 4676 4677 if (!NumZeros) 4678 return false; 4679 4680 // Considering the elements in the mask that are not consecutive zeros, 4681 // check if they consecutively come from only one of the source vectors. 4682 // 4683 // 0 { A, B, X, X } = V2 4684 // / \ / / 4685 // vector_shuffle V1, V2 <X, X, 4, 5> 4686 // 4687 if (!isShuffleMaskConsecutive(SVOp, 4688 NumZeros, // Mask Start Index 4689 NumElems-1, // Mask End Index 4690 0, // Where to start looking in the src vector 4691 NumElems, // Number of elements in vector 4692 OpSrc)) // Which source operand ? 4693 return false; 4694 4695 isLeft = true; 4696 ShAmt = NumZeros; 4697 ShVal = SVOp->getOperand(OpSrc); 4698 return true; 4699} 4700 4701/// isVectorShift - Returns true if the shuffle can be implemented as a 4702/// logical left or right shift of a vector. 4703static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4704 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4705 // Although the logic below support any bitwidth size, there are no 4706 // shift instructions which handle more than 128-bit vectors. 4707 if (SVOp->getValueType(0).getSizeInBits() > 128) 4708 return false; 4709 4710 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4711 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4712 return true; 4713 4714 return false; 4715} 4716 4717/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
4718/// 4719static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4720 unsigned NumNonZero, unsigned NumZero, 4721 SelectionDAG &DAG, 4722 const TargetLowering &TLI) { 4723 if (NumNonZero > 8) 4724 return SDValue(); 4725 4726 DebugLoc dl = Op.getDebugLoc(); 4727 SDValue V(0, 0); 4728 bool First = true; 4729 for (unsigned i = 0; i < 16; ++i) { 4730 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4731 if (ThisIsNonZero && First) { 4732 if (NumZero) 4733 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4734 else 4735 V = DAG.getUNDEF(MVT::v8i16); 4736 First = false; 4737 } 4738 4739 if ((i & 1) != 0) { 4740 SDValue ThisElt(0, 0), LastElt(0, 0); 4741 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4742 if (LastIsNonZero) { 4743 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4744 MVT::i16, Op.getOperand(i-1)); 4745 } 4746 if (ThisIsNonZero) { 4747 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4748 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4749 ThisElt, DAG.getConstant(8, MVT::i8)); 4750 if (LastIsNonZero) 4751 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4752 } else 4753 ThisElt = LastElt; 4754 4755 if (ThisElt.getNode()) 4756 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4757 DAG.getIntPtrConstant(i/2)); 4758 } 4759 } 4760 4761 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4762} 4763 4764/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 4765/// 4766static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4767 unsigned NumNonZero, unsigned NumZero, 4768 SelectionDAG &DAG, 4769 const TargetLowering &TLI) { 4770 if (NumNonZero > 4) 4771 return SDValue(); 4772 4773 DebugLoc dl = Op.getDebugLoc(); 4774 SDValue V(0, 0); 4775 bool First = true; 4776 for (unsigned i = 0; i < 8; ++i) { 4777 bool isNonZero = (NonZeros & (1 << i)) != 0; 4778 if (isNonZero) { 4779 if (First) { 4780 if (NumZero) 4781 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4782 else 4783 V = DAG.getUNDEF(MVT::v8i16); 4784 First = false; 4785 } 4786 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4787 MVT::v8i16, V, Op.getOperand(i), 4788 DAG.getIntPtrConstant(i)); 4789 } 4790 } 4791 4792 return V; 4793} 4794 4795/// getVShift - Return a vector logical shift node. 4796/// 4797static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4798 unsigned NumBits, SelectionDAG &DAG, 4799 const TargetLowering &TLI, DebugLoc dl) { 4800 assert(VT.getSizeInBits() == 128 && "Unknown type for VShift"); 4801 EVT ShVT = MVT::v2i64; 4802 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 4803 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4804 return DAG.getNode(ISD::BITCAST, dl, VT, 4805 DAG.getNode(Opc, dl, ShVT, SrcOp, 4806 DAG.getConstant(NumBits, 4807 TLI.getShiftAmountTy(SrcOp.getValueType())))); 4808} 4809 4810SDValue 4811X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4812 SelectionDAG &DAG) const { 4813 4814 // Check if the scalar load can be widened into a vector load. And if 4815 // the address is "base + cst" see if the cst can be "absorbed" into 4816 // the shuffle mask. 
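  // For example, a 4-byte scalar load from <FrameIndex + 8> being widened to
  // a 128-bit vector type: RequiredAlign is 16, StartOffset becomes
  // 8 & ~15 == 0, and EltNo == (8 - 0) >> 2 == 2, so a single 16-byte load at
  // the frame index is emitted and lane 2 is splatted with a <2,2,2,2>
  // shuffle. (A sketch of the arithmetic; the legality checks live below.)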
4817 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4818 SDValue Ptr = LD->getBasePtr(); 4819 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4820 return SDValue(); 4821 EVT PVT = LD->getValueType(0); 4822 if (PVT != MVT::i32 && PVT != MVT::f32) 4823 return SDValue(); 4824 4825 int FI = -1; 4826 int64_t Offset = 0; 4827 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4828 FI = FINode->getIndex(); 4829 Offset = 0; 4830 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4831 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4832 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4833 Offset = Ptr.getConstantOperandVal(1); 4834 Ptr = Ptr.getOperand(0); 4835 } else { 4836 return SDValue(); 4837 } 4838 4839 // FIXME: 256-bit vector instructions don't require a strict alignment, 4840 // improve this code to support it better. 4841 unsigned RequiredAlign = VT.getSizeInBits()/8; 4842 SDValue Chain = LD->getChain(); 4843 // Make sure the stack object alignment is at least 16 or 32. 4844 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4845 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 4846 if (MFI->isFixedObjectIndex(FI)) { 4847 // Can't change the alignment. FIXME: It's possible to compute 4848 // the exact stack offset and reference FI + adjust offset instead. 4849 // If someone *really* cares about this. That's the way to implement it. 4850 return SDValue(); 4851 } else { 4852 MFI->setObjectAlignment(FI, RequiredAlign); 4853 } 4854 } 4855 4856 // (Offset % 16 or 32) must be multiple of 4. Then address is then 4857 // Ptr + (Offset & ~15). 4858 if (Offset < 0) 4859 return SDValue(); 4860 if ((Offset % RequiredAlign) & 3) 4861 return SDValue(); 4862 int64_t StartOffset = Offset & ~(RequiredAlign-1); 4863 if (StartOffset) 4864 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4865 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4866 4867 int EltNo = (Offset - StartOffset) >> 2; 4868 int NumElems = VT.getVectorNumElements(); 4869 4870 EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32; 4871 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 4872 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 4873 LD->getPointerInfo().getWithOffset(StartOffset), 4874 false, false, false, 0); 4875 4876 // Canonicalize it to a v4i32 or v8i32 shuffle. 4877 SmallVector<int, 8> Mask; 4878 for (int i = 0; i < NumElems; ++i) 4879 Mask.push_back(EltNo); 4880 4881 V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1); 4882 return DAG.getNode(ISD::BITCAST, dl, NVT, 4883 DAG.getVectorShuffle(CanonVT, dl, V1, 4884 DAG.getUNDEF(CanonVT),&Mask[0])); 4885 } 4886 4887 return SDValue(); 4888} 4889 4890/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4891/// vector of type 'VT', see if the elements can be replaced by a single large 4892/// load which has the same value as a build_vector whose operands are 'elts'. 4893/// 4894/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4895/// 4896/// FIXME: we'd also like to handle the case where the last elements are zero 4897/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4898/// There's even a handy isZeroNode for that purpose. 
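///
/// When only the low half is available (e.g. elements 0 and 1 of a 4-element
/// vector are consecutive loads and the remaining elements are undef), the
/// code below emits an X86ISD::VZEXT_LOAD instead, i.e. a 64-bit load
/// zero-extended into the full 128-bit register.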
4899static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4900 DebugLoc &DL, SelectionDAG &DAG) { 4901 EVT EltVT = VT.getVectorElementType(); 4902 unsigned NumElems = Elts.size(); 4903 4904 LoadSDNode *LDBase = NULL; 4905 unsigned LastLoadedElt = -1U; 4906 4907 // For each element in the initializer, see if we've found a load or an undef. 4908 // If we don't find an initial load element, or later load elements are 4909 // non-consecutive, bail out. 4910 for (unsigned i = 0; i < NumElems; ++i) { 4911 SDValue Elt = Elts[i]; 4912 4913 if (!Elt.getNode() || 4914 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4915 return SDValue(); 4916 if (!LDBase) { 4917 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4918 return SDValue(); 4919 LDBase = cast<LoadSDNode>(Elt.getNode()); 4920 LastLoadedElt = i; 4921 continue; 4922 } 4923 if (Elt.getOpcode() == ISD::UNDEF) 4924 continue; 4925 4926 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4927 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4928 return SDValue(); 4929 LastLoadedElt = i; 4930 } 4931 4932 // If we have found an entire vector of loads and undefs, then return a large 4933 // load of the entire vector width starting at the base pointer. If we found 4934 // consecutive loads for the low half, generate a vzext_load node. 4935 if (LastLoadedElt == NumElems - 1) { 4936 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4937 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4938 LDBase->getPointerInfo(), 4939 LDBase->isVolatile(), LDBase->isNonTemporal(), 4940 LDBase->isInvariant(), 0); 4941 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4942 LDBase->getPointerInfo(), 4943 LDBase->isVolatile(), LDBase->isNonTemporal(), 4944 LDBase->isInvariant(), LDBase->getAlignment()); 4945 } else if (NumElems == 4 && LastLoadedElt == 1 && 4946 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 4947 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4948 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4949 SDValue ResNode = 4950 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64, 4951 LDBase->getPointerInfo(), 4952 LDBase->getAlignment(), 4953 false/*isVolatile*/, true/*ReadMem*/, 4954 false/*WriteMem*/); 4955 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4956 } 4957 return SDValue(); 4958} 4959 4960/// isVectorBroadcast - Check if the node chain is suitable to be xformed to 4961/// a vbroadcast node. We support two patterns: 4962/// 1. A splat BUILD_VECTOR which uses a single scalar load. 4963/// 2. A splat shuffle which uses a scalar_to_vector node which comes from 4964/// a scalar load. 4965/// The scalar load node is returned when a pattern is found, 4966/// or SDValue() otherwise. 4967static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) { 4968 EVT VT = Op.getValueType(); 4969 SDValue V = Op; 4970 4971 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 4972 V = V.getOperand(0); 4973 4974 //A suspected load to be broadcasted. 4975 SDValue Ld; 4976 4977 switch (V.getOpcode()) { 4978 default: 4979 // Unknown pattern found. 4980 return SDValue(); 4981 4982 case ISD::BUILD_VECTOR: { 4983 // The BUILD_VECTOR node must be a splat. 4984 if (!isSplatVector(V.getNode())) 4985 return SDValue(); 4986 4987 Ld = V.getOperand(0); 4988 4989 // The suspected load node has several users. Make sure that all 4990 // of its users are from the BUILD_VECTOR node. 
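    // (A splat BUILD_VECTOR is expected to reference the same load once per
    // element, so a use count of exactly getVectorNumElements() is taken as
    // evidence that this BUILD_VECTOR is the load's only consumer.)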
4991 if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) 4992 return SDValue(); 4993 break; 4994 } 4995 4996 case ISD::VECTOR_SHUFFLE: { 4997 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4998 4999 // Shuffles must have a splat mask where the first element is 5000 // broadcasted. 5001 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 5002 return SDValue(); 5003 5004 SDValue Sc = Op.getOperand(0); 5005 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR) 5006 return SDValue(); 5007 5008 Ld = Sc.getOperand(0); 5009 5010 // The scalar_to_vector node and the suspected 5011 // load node must have exactly one user. 5012 if (!Sc.hasOneUse() || !Ld.hasOneUse()) 5013 return SDValue(); 5014 break; 5015 } 5016 } 5017 5018 // The scalar source must be a normal load. 5019 if (!ISD::isNormalLoad(Ld.getNode())) 5020 return SDValue(); 5021 5022 bool Is256 = VT.getSizeInBits() == 256; 5023 bool Is128 = VT.getSizeInBits() == 128; 5024 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 5025 5026 if (hasAVX2) { 5027 // VBroadcast to YMM 5028 if (Is256 && (ScalarSize == 8 || ScalarSize == 16 || 5029 ScalarSize == 32 || ScalarSize == 64 )) 5030 return Ld; 5031 5032 // VBroadcast to XMM 5033 if (Is128 && (ScalarSize == 8 || ScalarSize == 32 || 5034 ScalarSize == 16 || ScalarSize == 64 )) 5035 return Ld; 5036 } 5037 5038 // VBroadcast to YMM 5039 if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) 5040 return Ld; 5041 5042 // VBroadcast to XMM 5043 if (Is128 && (ScalarSize == 32)) 5044 return Ld; 5045 5046 5047 // Unsupported broadcast. 5048 return SDValue(); 5049} 5050 5051SDValue 5052X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5053 DebugLoc dl = Op.getDebugLoc(); 5054 5055 EVT VT = Op.getValueType(); 5056 EVT ExtVT = VT.getVectorElementType(); 5057 unsigned NumElems = Op.getNumOperands(); 5058 5059 // Vectors containing all zeros can be matched by pxor and xorps later 5060 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 5061 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 5062 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 5063 if (Op.getValueType() == MVT::v4i32 || 5064 Op.getValueType() == MVT::v8i32) 5065 return Op; 5066 5067 return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl); 5068 } 5069 5070 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 5071 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use 5072 // vpcmpeqd on 256-bit vectors. 
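  // (Concretely, without AVX2 getOnesVector materializes a v4i32 of ~0 and
  // stitches it into both halves of the 256-bit value with two subvector
  // inserts; with AVX2 a single all-ones v8i32 BUILD_VECTOR is used.)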
5073 if (ISD::isBuildVectorAllOnes(Op.getNode())) { 5074 if (Op.getValueType() == MVT::v4i32 || 5075 (Op.getValueType() == MVT::v8i32 && Subtarget->hasAVX2())) 5076 return Op; 5077 5078 return getOnesVector(Op.getValueType(), Subtarget->hasAVX2(), DAG, dl); 5079 } 5080 5081 SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2()); 5082 if (Subtarget->hasAVX() && LD.getNode()) 5083 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); 5084 5085 unsigned EVTBits = ExtVT.getSizeInBits(); 5086 5087 unsigned NumZero = 0; 5088 unsigned NumNonZero = 0; 5089 unsigned NonZeros = 0; 5090 bool IsAllConstants = true; 5091 SmallSet<SDValue, 8> Values; 5092 for (unsigned i = 0; i < NumElems; ++i) { 5093 SDValue Elt = Op.getOperand(i); 5094 if (Elt.getOpcode() == ISD::UNDEF) 5095 continue; 5096 Values.insert(Elt); 5097 if (Elt.getOpcode() != ISD::Constant && 5098 Elt.getOpcode() != ISD::ConstantFP) 5099 IsAllConstants = false; 5100 if (X86::isZeroNode(Elt)) 5101 NumZero++; 5102 else { 5103 NonZeros |= (1 << i); 5104 NumNonZero++; 5105 } 5106 } 5107 5108 // All undef vector. Return an UNDEF. All zero vectors were handled above. 5109 if (NumNonZero == 0) 5110 return DAG.getUNDEF(VT); 5111 5112 // Special case for single non-zero, non-undef, element. 5113 if (NumNonZero == 1) { 5114 unsigned Idx = CountTrailingZeros_32(NonZeros); 5115 SDValue Item = Op.getOperand(Idx); 5116 5117 // If this is an insertion of an i64 value on x86-32, and if the top bits of 5118 // the value are obviously zero, truncate the value to i32 and do the 5119 // insertion that way. Only do this if the value is non-constant or if the 5120 // value is a constant being inserted into element 0. It is cheaper to do 5121 // a constant pool load than it is to do a movd + shuffle. 5122 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 5123 (!IsAllConstants || Idx == 0)) { 5124 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 5125 // Handle SSE only. 5126 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 5127 EVT VecVT = MVT::v4i32; 5128 unsigned VecElts = 4; 5129 5130 // Truncate the value (which may itself be a constant) to i32, and 5131 // convert it to a vector with movd (S2V+shuffle to zero extend). 5132 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 5133 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 5134 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 5135 Subtarget->hasXMMInt(), DAG); 5136 5137 // Now we have our 32-bit value zero extended in the low element of 5138 // a vector. If Idx != 0, swizzle it into place. 5139 if (Idx != 0) { 5140 SmallVector<int, 4> Mask; 5141 Mask.push_back(Idx); 5142 for (unsigned i = 1; i != VecElts; ++i) 5143 Mask.push_back(i); 5144 Item = DAG.getVectorShuffle(VecVT, dl, Item, 5145 DAG.getUNDEF(Item.getValueType()), 5146 &Mask[0]); 5147 } 5148 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); 5149 } 5150 } 5151 5152 // If we have a constant or non-constant insertion into the low element of 5153 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 5154 // the rest of the elements. This will be matched as movd/movq/movss/movsd 5155 // depending on what the source datatype is. 5156 if (Idx == 0) { 5157 if (NumZero == 0) { 5158 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5159 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 5160 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 5161 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5162 // Turn it into a MOVL (i.e. 
movss, movsd, or movd) to a zero vector. 5163 return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(), 5164 DAG); 5165 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 5166 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 5167 unsigned NumBits = VT.getSizeInBits(); 5168 assert((NumBits == 128 || NumBits == 256) && 5169 "Expected an SSE or AVX value type!"); 5170 EVT MiddleVT = NumBits == 128 ? MVT::v4i32 : MVT::v8i32; 5171 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 5172 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 5173 Subtarget->hasXMMInt(), DAG); 5174 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 5175 } 5176 } 5177 5178 // Is it a vector logical left shift? 5179 if (NumElems == 2 && Idx == 1 && 5180 X86::isZeroNode(Op.getOperand(0)) && 5181 !X86::isZeroNode(Op.getOperand(1))) { 5182 unsigned NumBits = VT.getSizeInBits(); 5183 return getVShift(true, VT, 5184 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5185 VT, Op.getOperand(1)), 5186 NumBits/2, DAG, *this, dl); 5187 } 5188 5189 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 5190 return SDValue(); 5191 5192 // Otherwise, if this is a vector with i32 or f32 elements, and the element 5193 // is a non-constant being inserted into an element other than the low one, 5194 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 5195 // movd/movss) to move this into the low element, then shuffle it into 5196 // place. 5197 if (EVTBits == 32) { 5198 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 5199 5200 // Turn it into a shuffle of zero and zero-extended scalar to vector. 5201 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 5202 Subtarget->hasXMMInt(), DAG); 5203 SmallVector<int, 8> MaskVec; 5204 for (unsigned i = 0; i < NumElems; i++) 5205 MaskVec.push_back(i == Idx ? 0 : 1); 5206 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 5207 } 5208 } 5209 5210 // Splat is obviously ok. Let legalizer expand it to a shuffle. 5211 if (Values.size() == 1) { 5212 if (EVTBits == 32) { 5213 // Instead of a shuffle like this: 5214 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 5215 // Check if it's possible to issue this instead. 5216 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 5217 unsigned Idx = CountTrailingZeros_32(NonZeros); 5218 SDValue Item = Op.getOperand(Idx); 5219 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 5220 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 5221 } 5222 return SDValue(); 5223 } 5224 5225 // A vector full of immediates; various special cases are already 5226 // handled, so this is best done with a single constant-pool load. 5227 if (IsAllConstants) 5228 return SDValue(); 5229 5230 // For AVX-length vectors, build the individual 128-bit pieces and use 5231 // shuffles to put them in place. 5232 if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) { 5233 SmallVector<SDValue, 32> V; 5234 for (unsigned i = 0; i < NumElems; ++i) 5235 V.push_back(Op.getOperand(i)); 5236 5237 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 5238 5239 // Build both the lower and upper subvector. 5240 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 5241 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 5242 NumElems/2); 5243 5244 // Recreate the wider vector with the lower and upper part. 
5245 SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower, 5246 DAG.getConstant(0, MVT::i32), DAG, dl); 5247 return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32), 5248 DAG, dl); 5249 } 5250 5251 // Let legalizer expand 2-wide build_vectors. 5252 if (EVTBits == 64) { 5253 if (NumNonZero == 1) { 5254 // One half is zero or undef. 5255 unsigned Idx = CountTrailingZeros_32(NonZeros); 5256 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 5257 Op.getOperand(Idx)); 5258 return getShuffleVectorZeroOrUndef(V2, Idx, true, 5259 Subtarget->hasXMMInt(), DAG); 5260 } 5261 return SDValue(); 5262 } 5263 5264 // If element VT is < 32 bits, convert it to inserts into a zero vector. 5265 if (EVTBits == 8 && NumElems == 16) { 5266 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 5267 *this); 5268 if (V.getNode()) return V; 5269 } 5270 5271 if (EVTBits == 16 && NumElems == 8) { 5272 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 5273 *this); 5274 if (V.getNode()) return V; 5275 } 5276 5277 // If element VT is == 32 bits, turn it into a number of shuffles. 5278 SmallVector<SDValue, 8> V; 5279 V.resize(NumElems); 5280 if (NumElems == 4 && NumZero > 0) { 5281 for (unsigned i = 0; i < 4; ++i) { 5282 bool isZero = !(NonZeros & (1 << i)); 5283 if (isZero) 5284 V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); 5285 else 5286 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5287 } 5288 5289 for (unsigned i = 0; i < 2; ++i) { 5290 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 5291 default: break; 5292 case 0: 5293 V[i] = V[i*2]; // Must be a zero vector. 5294 break; 5295 case 1: 5296 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 5297 break; 5298 case 2: 5299 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 5300 break; 5301 case 3: 5302 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 5303 break; 5304 } 5305 } 5306 5307 SmallVector<int, 8> MaskVec; 5308 bool Reverse = (NonZeros & 0x3) == 2; 5309 for (unsigned i = 0; i < 2; ++i) 5310 MaskVec.push_back(Reverse ? 1-i : i); 5311 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 5312 for (unsigned i = 0; i < 2; ++i) 5313 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 5314 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 5315 } 5316 5317 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 5318 // Check for a build vector of consecutive loads. 5319 for (unsigned i = 0; i < NumElems; ++i) 5320 V[i] = Op.getOperand(i); 5321 5322 // Check for elements which are consecutive loads. 5323 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 5324 if (LD.getNode()) 5325 return LD; 5326 5327 // For SSE 4.1, use insertps to put the high elements into the low element. 5328 if (getSubtarget()->hasSSE41orAVX()) { 5329 SDValue Result; 5330 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 5331 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 5332 else 5333 Result = DAG.getUNDEF(VT); 5334 5335 for (unsigned i = 1; i < NumElems; ++i) { 5336 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 5337 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 5338 Op.getOperand(i), DAG.getIntPtrConstant(i)); 5339 } 5340 return Result; 5341 } 5342 5343 // Otherwise, expand into a number of unpckl*, start by extending each of 5344 // our (non-undef) elements to the full vector width with the element in the 5345 // bottom slot of the vector (which generates no code for SSE). 
5346 for (unsigned i = 0; i < NumElems; ++i) { 5347 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 5348 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 5349 else 5350 V[i] = DAG.getUNDEF(VT); 5351 } 5352 5353 // Next, we iteratively mix elements, e.g. for v4f32: 5354 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 5355 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 5356 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 5357 unsigned EltStride = NumElems >> 1; 5358 while (EltStride != 0) { 5359 for (unsigned i = 0; i < EltStride; ++i) { 5360 // If V[i+EltStride] is undef and this is the first round of mixing, 5361 // then it is safe to just drop this shuffle: V[i] is already in the 5362 // right place, the one element (since it's the first round) being 5363 // inserted as undef can be dropped. This isn't safe for successive 5364 // rounds because they will permute elements within both vectors. 5365 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 5366 EltStride == NumElems/2) 5367 continue; 5368 5369 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 5370 } 5371 EltStride >>= 1; 5372 } 5373 return V[0]; 5374 } 5375 return SDValue(); 5376} 5377 5378// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place 5379// them in a MMX register. This is better than doing a stack convert. 5380static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5381 DebugLoc dl = Op.getDebugLoc(); 5382 EVT ResVT = Op.getValueType(); 5383 5384 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 5385 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 5386 int Mask[2]; 5387 SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); 5388 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 5389 InVec = Op.getOperand(1); 5390 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 5391 unsigned NumElts = ResVT.getVectorNumElements(); 5392 VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 5393 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 5394 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 5395 } else { 5396 InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); 5397 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 5398 Mask[0] = 0; Mask[1] = 2; 5399 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 5400 } 5401 return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 5402} 5403 5404// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 5405// to create 256-bit vectors from two other 128-bit ones. 5406static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 5407 DebugLoc dl = Op.getDebugLoc(); 5408 EVT ResVT = Op.getValueType(); 5409 5410 assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide"); 5411 5412 SDValue V1 = Op.getOperand(0); 5413 SDValue V2 = Op.getOperand(1); 5414 unsigned NumElems = ResVT.getVectorNumElements(); 5415 5416 SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1, 5417 DAG.getConstant(0, MVT::i32), DAG, dl); 5418 return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), 5419 DAG, dl); 5420} 5421 5422SDValue 5423X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 5424 EVT ResVT = Op.getValueType(); 5425 5426 assert(Op.getNumOperands() == 2); 5427 assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) && 5428 "Unsupported CONCAT_VECTORS for value type"); 5429 5430 // We support concatenate two MMX registers and place them in a MMX register. 
5431 // This is better than doing a stack convert. 5432 if (ResVT.is128BitVector()) 5433 return LowerMMXCONCAT_VECTORS(Op, DAG); 5434 5435 // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors 5436 // from two other 128-bit ones. 5437 return LowerAVXCONCAT_VECTORS(Op, DAG); 5438} 5439 5440// v8i16 shuffles - Prefer shuffles in the following order: 5441// 1. [all] pshuflw, pshufhw, optional move 5442// 2. [ssse3] 1 x pshufb 5443// 3. [ssse3] 2 x pshufb + 1 x por 5444// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 5445SDValue 5446X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 5447 SelectionDAG &DAG) const { 5448 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5449 SDValue V1 = SVOp->getOperand(0); 5450 SDValue V2 = SVOp->getOperand(1); 5451 DebugLoc dl = SVOp->getDebugLoc(); 5452 SmallVector<int, 8> MaskVals; 5453 5454 // Determine if more than 1 of the words in each of the low and high quadwords 5455 // of the result come from the same quadword of one of the two inputs. Undef 5456 // mask values count as coming from any quadword, for better codegen. 5457 unsigned LoQuad[] = { 0, 0, 0, 0 }; 5458 unsigned HiQuad[] = { 0, 0, 0, 0 }; 5459 BitVector InputQuads(4); 5460 for (unsigned i = 0; i < 8; ++i) { 5461 unsigned *Quad = i < 4 ? LoQuad : HiQuad; 5462 int EltIdx = SVOp->getMaskElt(i); 5463 MaskVals.push_back(EltIdx); 5464 if (EltIdx < 0) { 5465 ++Quad[0]; 5466 ++Quad[1]; 5467 ++Quad[2]; 5468 ++Quad[3]; 5469 continue; 5470 } 5471 ++Quad[EltIdx / 4]; 5472 InputQuads.set(EltIdx / 4); 5473 } 5474 5475 int BestLoQuad = -1; 5476 unsigned MaxQuad = 1; 5477 for (unsigned i = 0; i < 4; ++i) { 5478 if (LoQuad[i] > MaxQuad) { 5479 BestLoQuad = i; 5480 MaxQuad = LoQuad[i]; 5481 } 5482 } 5483 5484 int BestHiQuad = -1; 5485 MaxQuad = 1; 5486 for (unsigned i = 0; i < 4; ++i) { 5487 if (HiQuad[i] > MaxQuad) { 5488 BestHiQuad = i; 5489 MaxQuad = HiQuad[i]; 5490 } 5491 } 5492 5493 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 5494 // of the two input vectors, shuffle them into one input vector so only a 5495 // single pshufb instruction is necessary. If There are more than 2 input 5496 // quads, disable the next transformation since it does not help SSSE3. 5497 bool V1Used = InputQuads[0] || InputQuads[1]; 5498 bool V2Used = InputQuads[2] || InputQuads[3]; 5499 if (Subtarget->hasSSSE3orAVX()) { 5500 if (InputQuads.count() == 2 && V1Used && V2Used) { 5501 BestLoQuad = InputQuads.find_first(); 5502 BestHiQuad = InputQuads.find_next(BestLoQuad); 5503 } 5504 if (InputQuads.count() > 2) { 5505 BestLoQuad = -1; 5506 BestHiQuad = -1; 5507 } 5508 } 5509 5510 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 5511 // the shuffle mask. If a quad is scored as -1, that means that it contains 5512 // words from all 4 input quadwords. 5513 SDValue NewV; 5514 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 5515 SmallVector<int, 8> MaskV; 5516 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 5517 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 5518 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 5519 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 5520 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 5521 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 5522 5523 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 5524 // source words for the shuffle, to aid later transformations. 
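    // For example, with BestLoQuad == 2 and BestHiQuad == 1, an original word
    // index of 9 (quad 2) is remapped below to 9 & 3 == 1 and an index of 6
    // (quad 1) to (6 & 3) + 4 == 6, since those quads now occupy the low and
    // high halves of NewV respectively. (Worked example only.)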
5525 bool AllWordsInNewV = true; 5526 bool InOrder[2] = { true, true }; 5527 for (unsigned i = 0; i != 8; ++i) { 5528 int idx = MaskVals[i]; 5529 if (idx != (int)i) 5530 InOrder[i/4] = false; 5531 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 5532 continue; 5533 AllWordsInNewV = false; 5534 break; 5535 } 5536 5537 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 5538 if (AllWordsInNewV) { 5539 for (int i = 0; i != 8; ++i) { 5540 int idx = MaskVals[i]; 5541 if (idx < 0) 5542 continue; 5543 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 5544 if ((idx != i) && idx < 4) 5545 pshufhw = false; 5546 if ((idx != i) && idx > 3) 5547 pshuflw = false; 5548 } 5549 V1 = NewV; 5550 V2Used = false; 5551 BestLoQuad = 0; 5552 BestHiQuad = 1; 5553 } 5554 5555 // If we've eliminated the use of V2, and the new mask is a pshuflw or 5556 // pshufhw, that's as cheap as it gets. Return the new shuffle. 5557 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 5558 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 5559 unsigned TargetMask = 0; 5560 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 5561 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 5562 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 5563 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 5564 V1 = NewV.getOperand(0); 5565 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 5566 } 5567 } 5568 5569 // If we have SSSE3, and all words of the result are from 1 input vector, 5570 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 5571 // is present, fall back to case 4. 5572 if (Subtarget->hasSSSE3orAVX()) { 5573 SmallVector<SDValue,16> pshufbMask; 5574 5575 // If we have elements from both input vectors, set the high bit of the 5576 // shuffle mask element to zero out elements that come from V2 in the V1 5577 // mask, and elements that come from V1 in the V2 mask, so that the two 5578 // results can be OR'd together. 5579 bool TwoInputs = V1Used && V2Used; 5580 for (unsigned i = 0; i != 8; ++i) { 5581 int EltIdx = MaskVals[i] * 2; 5582 if (TwoInputs && (EltIdx >= 16)) { 5583 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5584 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5585 continue; 5586 } 5587 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5588 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 5589 } 5590 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 5591 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5592 DAG.getNode(ISD::BUILD_VECTOR, dl, 5593 MVT::v16i8, &pshufbMask[0], 16)); 5594 if (!TwoInputs) 5595 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5596 5597 // Calculate the shuffle mask for the second input, shuffle it, and 5598 // OR it with the first shuffled input. 
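    // E.g. a word mask element of 9 becomes the byte pair (18,19): it is
    // zeroed out (0x80,0x80) in the V1 pshufb mask built above and emitted as
    // (2,3) in the V2 mask built below, so the final OR merges the two
    // partial results into the complete shuffle.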
5599 pshufbMask.clear(); 5600 for (unsigned i = 0; i != 8; ++i) { 5601 int EltIdx = MaskVals[i] * 2; 5602 if (EltIdx < 16) { 5603 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5604 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5605 continue; 5606 } 5607 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5608 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 5609 } 5610 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 5611 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5612 DAG.getNode(ISD::BUILD_VECTOR, dl, 5613 MVT::v16i8, &pshufbMask[0], 16)); 5614 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5615 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5616 } 5617 5618 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 5619 // and update MaskVals with new element order. 5620 BitVector InOrder(8); 5621 if (BestLoQuad >= 0) { 5622 SmallVector<int, 8> MaskV; 5623 for (int i = 0; i != 4; ++i) { 5624 int idx = MaskVals[i]; 5625 if (idx < 0) { 5626 MaskV.push_back(-1); 5627 InOrder.set(i); 5628 } else if ((idx / 4) == BestLoQuad) { 5629 MaskV.push_back(idx & 3); 5630 InOrder.set(i); 5631 } else { 5632 MaskV.push_back(-1); 5633 } 5634 } 5635 for (unsigned i = 4; i != 8; ++i) 5636 MaskV.push_back(i); 5637 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5638 &MaskV[0]); 5639 5640 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX()) 5641 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 5642 NewV.getOperand(0), 5643 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 5644 DAG); 5645 } 5646 5647 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 5648 // and update MaskVals with the new element order. 5649 if (BestHiQuad >= 0) { 5650 SmallVector<int, 8> MaskV; 5651 for (unsigned i = 0; i != 4; ++i) 5652 MaskV.push_back(i); 5653 for (unsigned i = 4; i != 8; ++i) { 5654 int idx = MaskVals[i]; 5655 if (idx < 0) { 5656 MaskV.push_back(-1); 5657 InOrder.set(i); 5658 } else if ((idx / 4) == BestHiQuad) { 5659 MaskV.push_back((idx & 3) + 4); 5660 InOrder.set(i); 5661 } else { 5662 MaskV.push_back(-1); 5663 } 5664 } 5665 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5666 &MaskV[0]); 5667 5668 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX()) 5669 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 5670 NewV.getOperand(0), 5671 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 5672 DAG); 5673 } 5674 5675 // In case BestHi & BestLo were both -1, which means each quadword has a word 5676 // from each of the four input quadwords, calculate the InOrder bitvector now 5677 // before falling through to the insert/extract cleanup. 5678 if (BestLoQuad == -1 && BestHiQuad == -1) { 5679 NewV = V1; 5680 for (int i = 0; i != 8; ++i) 5681 if (MaskVals[i] < 0 || MaskVals[i] == i) 5682 InOrder.set(i); 5683 } 5684 5685 // The other elements are put in the right place using pextrw and pinsrw. 5686 for (unsigned i = 0; i != 8; ++i) { 5687 if (InOrder[i]) 5688 continue; 5689 int EltIdx = MaskVals[i]; 5690 if (EltIdx < 0) 5691 continue; 5692 SDValue ExtOp = (EltIdx < 8) 5693 ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 5694 DAG.getIntPtrConstant(EltIdx)) 5695 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 5696 DAG.getIntPtrConstant(EltIdx - 8)); 5697 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 5698 DAG.getIntPtrConstant(i)); 5699 } 5700 return NewV; 5701} 5702 5703// v16i8 shuffles - Prefer shuffles in the following order: 5704// 1. [ssse3] 1 x pshufb 5705// 2. [ssse3] 2 x pshufb + 1 x por 5706// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 5707static 5708SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 5709 SelectionDAG &DAG, 5710 const X86TargetLowering &TLI) { 5711 SDValue V1 = SVOp->getOperand(0); 5712 SDValue V2 = SVOp->getOperand(1); 5713 DebugLoc dl = SVOp->getDebugLoc(); 5714 SmallVector<int, 16> MaskVals; 5715 SVOp->getMask(MaskVals); 5716 5717 // If we have SSSE3, case 1 is generated when all result bytes come from 5718 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 5719 // present, fall back to case 3. 5720 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 5721 bool V1Only = true; 5722 bool V2Only = true; 5723 for (unsigned i = 0; i < 16; ++i) { 5724 int EltIdx = MaskVals[i]; 5725 if (EltIdx < 0) 5726 continue; 5727 if (EltIdx < 16) 5728 V2Only = false; 5729 else 5730 V1Only = false; 5731 } 5732 5733 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 5734 if (TLI.getSubtarget()->hasSSSE3orAVX()) { 5735 SmallVector<SDValue,16> pshufbMask; 5736 5737 // If all result elements are from one input vector, then only translate 5738 // undef mask values to 0x80 (zero out result) in the pshufb mask. 5739 // 5740 // Otherwise, we have elements from both input vectors, and must zero out 5741 // elements that come from V2 in the first mask, and V1 in the second mask 5742 // so that we can OR them together. 5743 bool TwoInputs = !(V1Only || V2Only); 5744 for (unsigned i = 0; i != 16; ++i) { 5745 int EltIdx = MaskVals[i]; 5746 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 5747 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5748 continue; 5749 } 5750 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5751 } 5752 // If all the elements are from V2, assign it to V1 and return after 5753 // building the first pshufb. 5754 if (V2Only) 5755 V1 = V2; 5756 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5757 DAG.getNode(ISD::BUILD_VECTOR, dl, 5758 MVT::v16i8, &pshufbMask[0], 16)); 5759 if (!TwoInputs) 5760 return V1; 5761 5762 // Calculate the shuffle mask for the second input, shuffle it, and 5763 // OR it with the first shuffled input. 5764 pshufbMask.clear(); 5765 for (unsigned i = 0; i != 16; ++i) { 5766 int EltIdx = MaskVals[i]; 5767 if (EltIdx < 16) { 5768 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5769 continue; 5770 } 5771 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5772 } 5773 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5774 DAG.getNode(ISD::BUILD_VECTOR, dl, 5775 MVT::v16i8, &pshufbMask[0], 16)); 5776 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5777 } 5778 5779 // No SSSE3 - Calculate in place words and then fix all out of place words 5780 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 5781 // the 16 different words that comprise the two doublequadword input vectors. 5782 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5783 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 5784 SDValue NewV = V2Only ? 
V2 : V1; 5785 for (int i = 0; i != 8; ++i) { 5786 int Elt0 = MaskVals[i*2]; 5787 int Elt1 = MaskVals[i*2+1]; 5788 5789 // This word of the result is all undef, skip it. 5790 if (Elt0 < 0 && Elt1 < 0) 5791 continue; 5792 5793 // This word of the result is already in the correct place, skip it. 5794 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 5795 continue; 5796 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 5797 continue; 5798 5799 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 5800 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 5801 SDValue InsElt; 5802 5803 // If Elt0 and Elt1 are defined, are consecutive, and can be load 5804 // using a single extract together, load it and store it. 5805 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 5806 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5807 DAG.getIntPtrConstant(Elt1 / 2)); 5808 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5809 DAG.getIntPtrConstant(i)); 5810 continue; 5811 } 5812 5813 // If Elt1 is defined, extract it from the appropriate source. If the 5814 // source byte is not also odd, shift the extracted word left 8 bits 5815 // otherwise clear the bottom 8 bits if we need to do an or. 5816 if (Elt1 >= 0) { 5817 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5818 DAG.getIntPtrConstant(Elt1 / 2)); 5819 if ((Elt1 & 1) == 0) 5820 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 5821 DAG.getConstant(8, 5822 TLI.getShiftAmountTy(InsElt.getValueType()))); 5823 else if (Elt0 >= 0) 5824 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 5825 DAG.getConstant(0xFF00, MVT::i16)); 5826 } 5827 // If Elt0 is defined, extract it from the appropriate source. If the 5828 // source byte is not also even, shift the extracted word right 8 bits. If 5829 // Elt1 was also defined, OR the extracted values together before 5830 // inserting them in the result. 5831 if (Elt0 >= 0) { 5832 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 5833 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 5834 if ((Elt0 & 1) != 0) 5835 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 5836 DAG.getConstant(8, 5837 TLI.getShiftAmountTy(InsElt0.getValueType()))); 5838 else if (Elt1 >= 0) 5839 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 5840 DAG.getConstant(0x00FF, MVT::i16)); 5841 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 5842 : InsElt0; 5843 } 5844 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5845 DAG.getIntPtrConstant(i)); 5846 } 5847 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 5848} 5849 5850/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 5851/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 5852/// done when every pair / quad of shuffle mask elements point to elements in 5853/// the right sequence. e.g. 5854/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 5855static 5856SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 5857 SelectionDAG &DAG, DebugLoc dl) { 5858 EVT VT = SVOp->getValueType(0); 5859 SDValue V1 = SVOp->getOperand(0); 5860 SDValue V2 = SVOp->getOperand(1); 5861 unsigned NumElems = VT.getVectorNumElements(); 5862 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 5863 EVT NewVT; 5864 switch (VT.getSimpleVT().SimpleTy) { 5865 default: assert(false && "Unexpected!"); 5866 case MVT::v4f32: NewVT = MVT::v2f64; break; 5867 case MVT::v4i32: NewVT = MVT::v2i64; break; 5868 case MVT::v8i16: NewVT = MVT::v4i32; break; 5869 case MVT::v16i8: NewVT = MVT::v4i32; break; 5870 } 5871 5872 int Scale = NumElems / NewWidth; 5873 SmallVector<int, 8> MaskVec; 5874 for (unsigned i = 0; i < NumElems; i += Scale) { 5875 int StartIdx = -1; 5876 for (int j = 0; j < Scale; ++j) { 5877 int EltIdx = SVOp->getMaskElt(i+j); 5878 if (EltIdx < 0) 5879 continue; 5880 if (StartIdx == -1) 5881 StartIdx = EltIdx - (EltIdx % Scale); 5882 if (EltIdx != StartIdx + j) 5883 return SDValue(); 5884 } 5885 if (StartIdx == -1) 5886 MaskVec.push_back(-1); 5887 else 5888 MaskVec.push_back(StartIdx / Scale); 5889 } 5890 5891 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 5892 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 5893 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 5894} 5895 5896/// getVZextMovL - Return a zero-extending vector move low node. 5897/// 5898static SDValue getVZextMovL(EVT VT, EVT OpVT, 5899 SDValue SrcOp, SelectionDAG &DAG, 5900 const X86Subtarget *Subtarget, DebugLoc dl) { 5901 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 5902 LoadSDNode *LD = NULL; 5903 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 5904 LD = dyn_cast<LoadSDNode>(SrcOp); 5905 if (!LD) { 5906 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 5907 // instead. 5908 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 5909 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 5910 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 5911 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 5912 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 5913 // PR2108 5914 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 5915 return DAG.getNode(ISD::BITCAST, dl, VT, 5916 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5917 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5918 OpVT, 5919 SrcOp.getOperand(0) 5920 .getOperand(0)))); 5921 } 5922 } 5923 } 5924 5925 return DAG.getNode(ISD::BITCAST, dl, VT, 5926 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5927 DAG.getNode(ISD::BITCAST, dl, 5928 OpVT, SrcOp))); 5929} 5930 5931/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector 5932/// shuffle node referes to only one lane in the sources. 5933static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) { 5934 EVT VT = SVOp->getValueType(0); 5935 int NumElems = VT.getVectorNumElements(); 5936 int HalfSize = NumElems/2; 5937 SmallVector<int, 16> M; 5938 SVOp->getMask(M); 5939 bool MatchA = false, MatchB = false; 5940 5941 for (int l = 0; l < NumElems*2; l += HalfSize) { 5942 if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) { 5943 MatchA = true; 5944 break; 5945 } 5946 } 5947 5948 for (int l = 0; l < NumElems*2; l += HalfSize) { 5949 if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) { 5950 MatchB = true; 5951 break; 5952 } 5953 } 5954 5955 return MatchA && MatchB; 5956} 5957 5958/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles 5959/// which could not be matched by any known target speficic shuffle 5960static SDValue 5961LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5962 if (areShuffleHalvesWithinDisjointLanes(SVOp)) { 5963 // If each half of a vector shuffle node referes to only one lane in the 5964 // source vectors, extract each used 128-bit lane and shuffle them using 5965 // 128-bit shuffles. 
Then, concatenate the results. Otherwise leave 5966 // the work to the legalizer. 5967 DebugLoc dl = SVOp->getDebugLoc(); 5968 EVT VT = SVOp->getValueType(0); 5969 int NumElems = VT.getVectorNumElements(); 5970 int HalfSize = NumElems/2; 5971 5972 // Extract the reference for each half 5973 int FstVecExtractIdx = 0, SndVecExtractIdx = 0; 5974 int FstVecOpNum = 0, SndVecOpNum = 0; 5975 for (int i = 0; i < HalfSize; ++i) { 5976 int Elt = SVOp->getMaskElt(i); 5977 if (SVOp->getMaskElt(i) < 0) 5978 continue; 5979 FstVecOpNum = Elt/NumElems; 5980 FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; 5981 break; 5982 } 5983 for (int i = HalfSize; i < NumElems; ++i) { 5984 int Elt = SVOp->getMaskElt(i); 5985 if (SVOp->getMaskElt(i) < 0) 5986 continue; 5987 SndVecOpNum = Elt/NumElems; 5988 SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; 5989 break; 5990 } 5991 5992 // Extract the subvectors 5993 SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum), 5994 DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl); 5995 SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum), 5996 DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl); 5997 5998 // Generate 128-bit shuffles 5999 SmallVector<int, 16> MaskV1, MaskV2; 6000 for (int i = 0; i < HalfSize; ++i) { 6001 int Elt = SVOp->getMaskElt(i); 6002 MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize); 6003 } 6004 for (int i = HalfSize; i < NumElems; ++i) { 6005 int Elt = SVOp->getMaskElt(i); 6006 MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize); 6007 } 6008 6009 EVT NVT = V1.getValueType(); 6010 V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]); 6011 V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]); 6012 6013 // Concatenate the result back 6014 SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1, 6015 DAG.getConstant(0, MVT::i32), DAG, dl); 6016 return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), 6017 DAG, dl); 6018 } 6019 6020 return SDValue(); 6021} 6022 6023/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with 6024/// 4 elements, and match them with several different shuffle types. 6025static SDValue 6026LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 6027 SDValue V1 = SVOp->getOperand(0); 6028 SDValue V2 = SVOp->getOperand(1); 6029 DebugLoc dl = SVOp->getDebugLoc(); 6030 EVT VT = SVOp->getValueType(0); 6031 6032 assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); 6033 6034 SmallVector<std::pair<int, int>, 8> Locs; 6035 Locs.resize(4); 6036 SmallVector<int, 8> Mask1(4U, -1); 6037 SmallVector<int, 8> PermMask; 6038 SVOp->getMask(PermMask); 6039 6040 unsigned NumHi = 0; 6041 unsigned NumLo = 0; 6042 for (unsigned i = 0; i != 4; ++i) { 6043 int Idx = PermMask[i]; 6044 if (Idx < 0) { 6045 Locs[i] = std::make_pair(-1, -1); 6046 } else { 6047 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 6048 if (Idx < 4) { 6049 Locs[i] = std::make_pair(0, NumLo); 6050 Mask1[NumLo] = Idx; 6051 NumLo++; 6052 } else { 6053 Locs[i] = std::make_pair(1, NumHi); 6054 if (2+NumHi < 4) 6055 Mask1[2+NumHi] = Idx; 6056 NumHi++; 6057 } 6058 } 6059 } 6060 6061 if (NumLo <= 2 && NumHi <= 2) { 6062 // If no more than two elements come from either vector. This can be 6063 // implemented with two shuffles. First shuffle gather the elements. 6064 // The second shuffle, which takes the first shuffle as both of its 6065 // vector operands, put the elements into the right order. 
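    // E.g. for PermMask <0,4,1,5>: Mask1 gathers to <0,1,4,5>, producing
    // T = <V1[0],V1[1],V2[0],V2[1]>; Mask2 then becomes <0,2,5,7> on (T,T),
    // which yields <V1[0],V2[0],V1[1],V2[1]> as required. (Worked example.)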
6066 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6067 6068 SmallVector<int, 8> Mask2(4U, -1); 6069 6070 for (unsigned i = 0; i != 4; ++i) { 6071 if (Locs[i].first == -1) 6072 continue; 6073 else { 6074 unsigned Idx = (i < 2) ? 0 : 4; 6075 Idx += Locs[i].first * 2 + Locs[i].second; 6076 Mask2[i] = Idx; 6077 } 6078 } 6079 6080 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 6081 } else if (NumLo == 3 || NumHi == 3) { 6082 // Otherwise, we must have three elements from one vector, call it X, and 6083 // one element from the other, call it Y. First, use a shufps to build an 6084 // intermediate vector with the one element from Y and the element from X 6085 // that will be in the same half in the final destination (the indexes don't 6086 // matter). Then, use a shufps to build the final vector, taking the half 6087 // containing the element from Y from the intermediate, and the other half 6088 // from X. 6089 if (NumHi == 3) { 6090 // Normalize it so the 3 elements come from V1. 6091 CommuteVectorShuffleMask(PermMask, 4); 6092 std::swap(V1, V2); 6093 } 6094 6095 // Find the element from V2. 6096 unsigned HiIndex; 6097 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 6098 int Val = PermMask[HiIndex]; 6099 if (Val < 0) 6100 continue; 6101 if (Val >= 4) 6102 break; 6103 } 6104 6105 Mask1[0] = PermMask[HiIndex]; 6106 Mask1[1] = -1; 6107 Mask1[2] = PermMask[HiIndex^1]; 6108 Mask1[3] = -1; 6109 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6110 6111 if (HiIndex >= 2) { 6112 Mask1[0] = PermMask[0]; 6113 Mask1[1] = PermMask[1]; 6114 Mask1[2] = HiIndex & 1 ? 6 : 4; 6115 Mask1[3] = HiIndex & 1 ? 4 : 6; 6116 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 6117 } else { 6118 Mask1[0] = HiIndex & 1 ? 2 : 0; 6119 Mask1[1] = HiIndex & 1 ? 0 : 2; 6120 Mask1[2] = PermMask[2]; 6121 Mask1[3] = PermMask[3]; 6122 if (Mask1[2] >= 0) 6123 Mask1[2] += 4; 6124 if (Mask1[3] >= 0) 6125 Mask1[3] += 4; 6126 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 6127 } 6128 } 6129 6130 // Break it into (shuffle shuffle_hi, shuffle_lo). 
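// Sketch of this decomposition (mask values chosen purely for illustration):
// for a v4f32 mask such as <2,0,3,1>, the loop below builds LoMask <2,0,u,u>
// and HiMask <3,1,u,u>, shuffles the original operands with each, and then
// blends the two intermediates with mask <0,1,4,5> to recreate the requested
// element order.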
6131 Locs.clear(); 6132 Locs.resize(4); 6133 SmallVector<int,8> LoMask(4U, -1); 6134 SmallVector<int,8> HiMask(4U, -1); 6135 6136 SmallVector<int,8> *MaskPtr = &LoMask; 6137 unsigned MaskIdx = 0; 6138 unsigned LoIdx = 0; 6139 unsigned HiIdx = 2; 6140 for (unsigned i = 0; i != 4; ++i) { 6141 if (i == 2) { 6142 MaskPtr = &HiMask; 6143 MaskIdx = 1; 6144 LoIdx = 0; 6145 HiIdx = 2; 6146 } 6147 int Idx = PermMask[i]; 6148 if (Idx < 0) { 6149 Locs[i] = std::make_pair(-1, -1); 6150 } else if (Idx < 4) { 6151 Locs[i] = std::make_pair(MaskIdx, LoIdx); 6152 (*MaskPtr)[LoIdx] = Idx; 6153 LoIdx++; 6154 } else { 6155 Locs[i] = std::make_pair(MaskIdx, HiIdx); 6156 (*MaskPtr)[HiIdx] = Idx; 6157 HiIdx++; 6158 } 6159 } 6160 6161 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 6162 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 6163 SmallVector<int, 8> MaskOps; 6164 for (unsigned i = 0; i != 4; ++i) { 6165 if (Locs[i].first == -1) { 6166 MaskOps.push_back(-1); 6167 } else { 6168 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 6169 MaskOps.push_back(Idx); 6170 } 6171 } 6172 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 6173} 6174 6175static bool MayFoldVectorLoad(SDValue V) { 6176 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6177 V = V.getOperand(0); 6178 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 6179 V = V.getOperand(0); 6180 if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && 6181 V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) 6182 // BUILD_VECTOR (load), undef 6183 V = V.getOperand(0); 6184 if (MayFoldLoad(V)) 6185 return true; 6186 return false; 6187} 6188 6189// FIXME: the version above should always be used. Since there's 6190// a bug where several vector shuffles can't be folded because the 6191// DAG is not updated during lowering and a node claims to have two 6192// uses while it only has one, use this version, and let isel match 6193// another instruction if the load really happens to have more than 6194// one use. Remove this version after this bug gets fixed. 6195// rdar://8434668, PR8156 6196static bool RelaxedMayFoldVectorLoad(SDValue V) { 6197 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 6198 V = V.getOperand(0); 6199 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 6200 V = V.getOperand(0); 6201 if (ISD::isNormalLoad(V.getNode())) 6202 return true; 6203 return false; 6204} 6205 6206/// CanXFormVExtractWithShuffleIntoLoad - Check if the current shuffle is used by 6207/// a vector extract, and if both can be later optimized into a single load. 6208/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 6209/// here because otherwise a target specific shuffle node is going to be 6210/// emitted for this shuffle, and the optimization will not be done. 6211/// FIXME: This is probably not the best approach, but it fixes the problem 6212/// until the right path is decided.
6213static 6214bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 6215 const TargetLowering &TLI) { 6216 EVT VT = V.getValueType(); 6217 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 6218 6219 // Be sure that the vector shuffle is present in a pattern like this: 6220 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 6221 if (!V.hasOneUse()) 6222 return false; 6223 6224 SDNode *N = *V.getNode()->use_begin(); 6225 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 6226 return false; 6227 6228 SDValue EltNo = N->getOperand(1); 6229 if (!isa<ConstantSDNode>(EltNo)) 6230 return false; 6231 6232 // If the bit convert changed the number of elements, it is unsafe 6233 // to examine the mask. 6234 bool HasShuffleIntoBitcast = false; 6235 if (V.getOpcode() == ISD::BITCAST) { 6236 EVT SrcVT = V.getOperand(0).getValueType(); 6237 if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) 6238 return false; 6239 V = V.getOperand(0); 6240 HasShuffleIntoBitcast = true; 6241 } 6242 6243 // Select the input vector, guarding against an out-of-range extract index. 6244 unsigned NumElems = VT.getVectorNumElements(); 6245 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 6246 int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); 6247 V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); 6248 6249 // Skip one more bit_convert if necessary 6250 if (V.getOpcode() == ISD::BITCAST) 6251 V = V.getOperand(0); 6252 6253 if (ISD::isNormalLoad(V.getNode())) { 6254 // Is the original load suitable? 6255 LoadSDNode *LN0 = cast<LoadSDNode>(V); 6256 6257 // FIXME: avoid the multi-use bug that is preventing lots of 6258 // foldings from being detected. This is still wrong of course, but 6259 // it gives the temporarily desired behavior, and if it happens that 6260 // the load really has more uses, during isel it will not fold, and 6261 // will generate poor code. 6262 if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() 6263 return false; 6264 6265 if (!HasShuffleIntoBitcast) 6266 return true; 6267 6268 // If there's a bitcast before the shuffle, check if the load type and 6269 // alignment is valid. 6270 unsigned Align = LN0->getAlignment(); 6271 unsigned NewAlign = 6272 TLI.getTargetData()->getABITypeAlignment( 6273 VT.getTypeForEVT(*DAG.getContext())); 6274 6275 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 6276 return false; 6277 } 6278 6279 return true; 6280} 6281 6282static 6283SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { 6284 EVT VT = Op.getValueType(); 6285 6286 // Canonicalize to v2f64.
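// MOVDDUP duplicates the low 64-bit element of its source (<2 x double>
// <a,b> becomes <a,a>), so any 128-bit input is first bitcast to v2f64 here
// and the result is bitcast back to the original type below.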
6287 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 6288 return DAG.getNode(ISD::BITCAST, dl, VT, 6289 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 6290 V1, DAG)); 6291} 6292 6293static 6294SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, 6295 bool HasXMMInt) { 6296 SDValue V1 = Op.getOperand(0); 6297 SDValue V2 = Op.getOperand(1); 6298 EVT VT = Op.getValueType(); 6299 6300 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 6301 6302 if (HasXMMInt && VT == MVT::v2f64) 6303 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 6304 6305 // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1) 6306 return DAG.getNode(ISD::BITCAST, dl, VT, 6307 getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32, 6308 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1), 6309 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG)); 6310} 6311 6312static 6313SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { 6314 SDValue V1 = Op.getOperand(0); 6315 SDValue V2 = Op.getOperand(1); 6316 EVT VT = Op.getValueType(); 6317 6318 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 6319 "unsupported shuffle type"); 6320 6321 if (V2.getOpcode() == ISD::UNDEF) 6322 V2 = V1; 6323 6324 // v4i32 or v4f32 6325 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 6326} 6327 6328static inline unsigned getSHUFPOpcode(EVT VT) { 6329 switch(VT.getSimpleVT().SimpleTy) { 6330 case MVT::v8i32: // Use fp unit for int unpack. 6331 case MVT::v8f32: 6332 case MVT::v4i32: // Use fp unit for int unpack. 6333 case MVT::v4f32: return X86ISD::SHUFPS; 6334 case MVT::v4i64: // Use fp unit for int unpack. 6335 case MVT::v4f64: 6336 case MVT::v2i64: // Use fp unit for int unpack. 6337 case MVT::v2f64: return X86ISD::SHUFPD; 6338 default: 6339 llvm_unreachable("Unknown type for shufp*"); 6340 } 6341 return 0; 6342} 6343 6344static 6345SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { 6346 SDValue V1 = Op.getOperand(0); 6347 SDValue V2 = Op.getOperand(1); 6348 EVT VT = Op.getValueType(); 6349 unsigned NumElems = VT.getVectorNumElements(); 6350 6351 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 6352 // operand of these instructions is only memory, so check if there's a 6353 // potential load folding here, otherwise use SHUFPS or MOVSD to match the 6354 // same masks. 6355 bool CanFoldLoad = false; 6356 6357 // Trivial case, when V2 comes from a load. 6358 if (MayFoldVectorLoad(V2)) 6359 CanFoldLoad = true; 6360 6361 // When V1 is a load, it can be folded later into a store in isel, example: 6362 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 6363 // turns into: 6364 // (MOVLPSmr addr:$src1, VR128:$src2) 6365 // So, recognize this potential and also use MOVLPS or MOVLPD 6366 else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 6367 CanFoldLoad = true; 6368 6369 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6370 if (CanFoldLoad) { 6371 if (HasXMMInt && NumElems == 2) 6372 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 6373 6374 if (NumElems == 4) 6375 // If we don't care about the second element, proceed to use movss.
6376 if (SVOp->getMaskElt(1) != -1) 6377 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 6378 } 6379 6380 // movl and movlp will both match v2i64, but v2i64 is never matched by 6381 // movl earlier because we make it strict to avoid messing with the movlp load 6382 // folding logic (see the code above the getMOVLP call). Match it here instead; 6383 // this is horrible, but it will stay like this until we move all shuffle 6384 // matching to x86 specific nodes. Note that for the 1st condition all 6385 // types are matched with movsd. 6386 if (HasXMMInt) { 6387 // FIXME: isMOVLMask should be checked and matched before getMOVLP, 6388 // so as to remove this logic from here, as much as possible 6389 if (NumElems == 2 || !X86::isMOVLMask(SVOp)) 6390 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6391 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6392 } 6393 6394 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 6395 6396 // Invert the operand order and use SHUFPS to match it. 6397 return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1, 6398 X86::getShuffleSHUFImmediate(SVOp), DAG); 6399} 6400 6401static 6402SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, 6403 const TargetLowering &TLI, 6404 const X86Subtarget *Subtarget) { 6405 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6406 EVT VT = Op.getValueType(); 6407 DebugLoc dl = Op.getDebugLoc(); 6408 SDValue V1 = Op.getOperand(0); 6409 SDValue V2 = Op.getOperand(1); 6410 6411 if (isZeroShuffle(SVOp)) 6412 return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); 6413 6414 // Handle splat operations 6415 if (SVOp->isSplat()) { 6416 unsigned NumElem = VT.getVectorNumElements(); 6417 int Size = VT.getSizeInBits(); 6418 // Special case, this is the only place now where it's allowed to return 6419 // a vector_shuffle operation without using a target specific node, because 6420 // *hopefully* it will be optimized away by the dag combiner. FIXME: should 6421 // this be moved to DAGCombine instead? 6422 if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) 6423 return Op; 6424 6425 // Use vbroadcast whenever the splat comes from a foldable load 6426 SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2()); 6427 if (Subtarget->hasAVX() && LD.getNode()) 6428 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); 6429 6430 // Handle splats by matching through known shuffle masks 6431 if ((Size == 128 && NumElem <= 4) || 6432 (Size == 256 && NumElem < 8)) 6433 return SDValue(); 6434 6435 // All remaining splats are promoted to target supported vector shuffles. 6436 return PromoteSplat(SVOp, DAG); 6437 } 6438 6439 // If the shuffle can be profitably rewritten as a narrower shuffle, then 6440 // do it! 6441 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 6442 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6443 if (NewOp.getNode()) 6444 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 6445 } else if ((VT == MVT::v4i32 || 6446 (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) { 6447 // FIXME: Figure out a cleaner way to do this. 6448 // Try to make use of movq to zero out the top part.
6449 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 6450 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6451 if (NewOp.getNode()) { 6452 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 6453 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 6454 DAG, Subtarget, dl); 6455 } 6456 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 6457 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 6458 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 6459 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 6460 DAG, Subtarget, dl); 6461 } 6462 } 6463 return SDValue(); 6464} 6465 6466SDValue 6467X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 6468 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 6469 SDValue V1 = Op.getOperand(0); 6470 SDValue V2 = Op.getOperand(1); 6471 EVT VT = Op.getValueType(); 6472 DebugLoc dl = Op.getDebugLoc(); 6473 unsigned NumElems = VT.getVectorNumElements(); 6474 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 6475 bool V1IsSplat = false; 6476 bool V2IsSplat = false; 6477 bool HasXMMInt = Subtarget->hasXMMInt(); 6478 bool HasAVX = Subtarget->hasAVX(); 6479 bool HasAVX2 = Subtarget->hasAVX2(); 6480 MachineFunction &MF = DAG.getMachineFunction(); 6481 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 6482 6483 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); 6484 6485 assert(V1.getOpcode() != ISD::UNDEF && "Op 1 of shuffle should not be undef"); 6486 6487 // Vector shuffle lowering takes 3 steps: 6488 // 6489 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 6490 // narrowing and commutation of operands should be handled. 6491 // 2) Matching of shuffles with known shuffle masks to x86 target specific 6492 // shuffle nodes. 6493 // 3) Rewriting of unmatched masks into new generic shuffle operations, 6494 // so the shuffle can be broken into other shuffles and the legalizer can 6495 // try the lowering again. 6496 // 6497 // The general idea is that no vector_shuffle operation should be left to 6498 // be matched during isel, all of them must be converted to a target specific 6499 // node here. 6500 6501 // Normalize the input vectors. Here splats, zeroed vectors, profitable 6502 // narrowing and commutation of operands should be handled. The actual code 6503 // doesn't include all of those, work in progress... 6504 SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); 6505 if (NewOp.getNode()) 6506 return NewOp; 6507 6508 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 6509 // unpckh_undef). Only use pshufd if speed is more important than size. 
6510 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp, HasAVX2)) 6511 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6512 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp, HasAVX2)) 6513 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6514 6515 if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3orAVX() && 6516 V2IsUndef && RelaxedMayFoldVectorLoad(V1)) 6517 return getMOVDDup(Op, dl, V1, DAG); 6518 6519 if (X86::isMOVHLPS_v_undef_Mask(SVOp)) 6520 return getMOVHighToLow(Op, dl, DAG); 6521 6522 // Use to match splats 6523 if (HasXMMInt && X86::isUNPCKHMask(SVOp, HasAVX2) && V2IsUndef && 6524 (VT == MVT::v2f64 || VT == MVT::v2i64)) 6525 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6526 6527 if (X86::isPSHUFDMask(SVOp)) { 6528 // The actual implementation will match the mask in the if above and then 6529 // during isel it can match several different instructions, not only pshufd 6530 // as its name says, sad but true, emulate the behavior for now... 6531 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 6532 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 6533 6534 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 6535 6536 if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32)) 6537 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 6538 6539 return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1, 6540 TargetMask, DAG); 6541 } 6542 6543 // Check if this can be converted into a logical shift. 6544 bool isLeft = false; 6545 unsigned ShAmt = 0; 6546 SDValue ShVal; 6547 bool isShift = HasXMMInt && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 6548 if (isShift && ShVal.hasOneUse()) { 6549 // If the shifted value has multiple uses, it may be cheaper to use 6550 // v_set0 + movlhps or movhlps, etc. 6551 EVT EltVT = VT.getVectorElementType(); 6552 ShAmt *= EltVT.getSizeInBits(); 6553 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6554 } 6555 6556 if (X86::isMOVLMask(SVOp)) { 6557 if (ISD::isBuildVectorAllZeros(V1.getNode())) 6558 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 6559 if (!X86::isMOVLPMask(SVOp)) { 6560 if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64)) 6561 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6562 6563 if (VT == MVT::v4i32 || VT == MVT::v4f32) 6564 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6565 } 6566 } 6567 6568 // FIXME: fold these into legal mask. 6569 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp, HasAVX2)) 6570 return getMOVLowToHigh(Op, dl, DAG, HasXMMInt); 6571 6572 if (X86::isMOVHLPSMask(SVOp)) 6573 return getMOVHighToLow(Op, dl, DAG); 6574 6575 if (X86::isMOVSHDUPMask(SVOp, Subtarget)) 6576 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 6577 6578 if (X86::isMOVSLDUPMask(SVOp, Subtarget)) 6579 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 6580 6581 if (X86::isMOVLPMask(SVOp)) 6582 return getMOVLP(Op, dl, DAG, HasXMMInt); 6583 6584 if (ShouldXformToMOVHLPS(SVOp) || 6585 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 6586 return CommuteVectorShuffle(SVOp, DAG); 6587 6588 if (isShift) { 6589 // No better options. Use a vshl / vsrl. 6590 EVT EltVT = VT.getVectorElementType(); 6591 ShAmt *= EltVT.getSizeInBits(); 6592 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6593 } 6594 6595 bool Commuted = false; 6596 // FIXME: This should also accept a bitcast of a splat? 
Be careful, not 6597 // 1,1,1,1 -> v8i16 though. 6598 V1IsSplat = isSplatVector(V1.getNode()); 6599 V2IsSplat = isSplatVector(V2.getNode()); 6600 6601 // Canonicalize the splat or undef, if present, to be on the RHS. 6602 if (V1IsSplat && !V2IsSplat) { 6603 Op = CommuteVectorShuffle(SVOp, DAG); 6604 SVOp = cast<ShuffleVectorSDNode>(Op); 6605 V1 = SVOp->getOperand(0); 6606 V2 = SVOp->getOperand(1); 6607 std::swap(V1IsSplat, V2IsSplat); 6608 Commuted = true; 6609 } 6610 6611 SmallVector<int, 32> M; 6612 SVOp->getMask(M); 6613 6614 if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { 6615 // Shuffling low element of v1 into undef, just return v1. 6616 if (V2IsUndef) 6617 return V1; 6618 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 6619 // the instruction selector will not match, so get a canonical MOVL with 6620 // swapped operands to undo the commute. 6621 return getMOVL(DAG, dl, VT, V2, V1); 6622 } 6623 6624 if (isUNPCKLMask(M, VT, HasAVX2)) 6625 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 6626 6627 if (isUNPCKHMask(M, VT, HasAVX2)) 6628 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 6629 6630 if (V2IsSplat) { 6631 // Normalize mask so all entries that point to V2 points to its first 6632 // element then try to match unpck{h|l} again. If match, return a 6633 // new vector_shuffle with the corrected mask. 6634 SDValue NewMask = NormalizeMask(SVOp, DAG); 6635 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 6636 if (NSVOp != SVOp) { 6637 if (X86::isUNPCKLMask(NSVOp, HasAVX2, true)) { 6638 return NewMask; 6639 } else if (X86::isUNPCKHMask(NSVOp, HasAVX2, true)) { 6640 return NewMask; 6641 } 6642 } 6643 } 6644 6645 if (Commuted) { 6646 // Commute is back and try unpck* again. 6647 // FIXME: this seems wrong. 6648 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 6649 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 6650 6651 if (X86::isUNPCKLMask(NewSVOp, HasAVX2)) 6652 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V2, V1, DAG); 6653 6654 if (X86::isUNPCKHMask(NewSVOp, HasAVX2)) 6655 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V2, V1, DAG); 6656 } 6657 6658 // Normalize the node to match x86 shuffle ops if needed 6659 if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true) || 6660 isVSHUFPYMask(M, VT, HasAVX, /* Commuted */ true))) 6661 return CommuteVectorShuffle(SVOp, DAG); 6662 6663 // The checks below are all present in isShuffleMaskLegal, but they are 6664 // inlined here right now to enable us to directly emit target specific 6665 // nodes, and remove one by one until they don't return Op anymore. 
6666 6667 if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX())) 6668 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 6669 getShufflePALIGNRImmediate(SVOp), 6670 DAG); 6671 6672 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 6673 SVOp->getSplatIndex() == 0 && V2IsUndef) { 6674 if (VT == MVT::v2f64 || VT == MVT::v2i64) 6675 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6676 } 6677 6678 if (isPSHUFHWMask(M, VT)) 6679 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 6680 X86::getShufflePSHUFHWImmediate(SVOp), 6681 DAG); 6682 6683 if (isPSHUFLWMask(M, VT)) 6684 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 6685 X86::getShufflePSHUFLWImmediate(SVOp), 6686 DAG); 6687 6688 if (isSHUFPMask(M, VT)) 6689 return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, 6690 X86::getShuffleSHUFImmediate(SVOp), DAG); 6691 6692 if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) 6693 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 6694 if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) 6695 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 6696 6697 //===--------------------------------------------------------------------===// 6698 // Generate target specific nodes for 128 or 256-bit shuffles only 6699 // supported in the AVX instruction set. 6700 // 6701 6702 // Handle VMOVDDUPY permutations 6703 if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX)) 6704 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); 6705 6706 // Handle VPERMILPS/D* permutations 6707 if (isVPERMILPMask(M, VT, HasAVX)) 6708 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, 6709 getShuffleVPERMILPImmediate(SVOp), DAG); 6710 6711 // Handle VPERM2F128/VPERM2I128 permutations 6712 if (isVPERM2X128Mask(M, VT, HasAVX)) 6713 return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, 6714 V2, getShuffleVPERM2X128Immediate(SVOp), DAG); 6715 6716 // Handle VSHUFPS/DY permutations 6717 if (isVSHUFPYMask(M, VT, HasAVX)) 6718 return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, 6719 getShuffleVSHUFPYImmediate(SVOp), DAG); 6720 6721 //===--------------------------------------------------------------------===// 6722 // Since no target specific shuffle was selected for this generic one, 6723 // lower it into other known shuffles. FIXME: this isn't true yet, but 6724 // this is the plan. 6725 // 6726 6727 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 6728 if (VT == MVT::v8i16) { 6729 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 6730 if (NewOp.getNode()) 6731 return NewOp; 6732 } 6733 6734 if (VT == MVT::v16i8) { 6735 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 6736 if (NewOp.getNode()) 6737 return NewOp; 6738 } 6739 6740 // Handle all 128-bit wide vectors with 4 elements, and match them with 6741 // several different shuffle types. 
6742 if (NumElems == 4 && VT.getSizeInBits() == 128) 6743 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 6744 6745 // Handle general 256-bit shuffles 6746 if (VT.is256BitVector()) 6747 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 6748 6749 return SDValue(); 6750} 6751 6752SDValue 6753X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 6754 SelectionDAG &DAG) const { 6755 EVT VT = Op.getValueType(); 6756 DebugLoc dl = Op.getDebugLoc(); 6757 6758 if (Op.getOperand(0).getValueType().getSizeInBits() != 128) 6759 return SDValue(); 6760 6761 if (VT.getSizeInBits() == 8) { 6762 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 6763 Op.getOperand(0), Op.getOperand(1)); 6764 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6765 DAG.getValueType(VT)); 6766 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6767 } else if (VT.getSizeInBits() == 16) { 6768 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6769 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 6770 if (Idx == 0) 6771 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6772 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6773 DAG.getNode(ISD::BITCAST, dl, 6774 MVT::v4i32, 6775 Op.getOperand(0)), 6776 Op.getOperand(1))); 6777 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 6778 Op.getOperand(0), Op.getOperand(1)); 6779 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6780 DAG.getValueType(VT)); 6781 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6782 } else if (VT == MVT::f32) { 6783 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 6784 // the result back to FR32 register. It's only worth matching if the 6785 // result has a single use which is a store or a bitcast to i32. And in 6786 // the case of a store, it's not worth it if the index is a constant 0, 6787 // because a MOVSSmr can be used instead, which is smaller and faster. 6788 if (!Op.hasOneUse()) 6789 return SDValue(); 6790 SDNode *User = *Op.getNode()->use_begin(); 6791 if ((User->getOpcode() != ISD::STORE || 6792 (isa<ConstantSDNode>(Op.getOperand(1)) && 6793 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 6794 (User->getOpcode() != ISD::BITCAST || 6795 User->getValueType(0) != MVT::i32)) 6796 return SDValue(); 6797 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6798 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 6799 Op.getOperand(0)), 6800 Op.getOperand(1)); 6801 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 6802 } else if (VT == MVT::i32 || VT == MVT::i64) { 6803 // ExtractPS/pextrq works with constant index. 6804 if (isa<ConstantSDNode>(Op.getOperand(1))) 6805 return Op; 6806 } 6807 return SDValue(); 6808} 6809 6810 6811SDValue 6812X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 6813 SelectionDAG &DAG) const { 6814 if (!isa<ConstantSDNode>(Op.getOperand(1))) 6815 return SDValue(); 6816 6817 SDValue Vec = Op.getOperand(0); 6818 EVT VecVT = Vec.getValueType(); 6819 6820 // If this is a 256-bit vector result, first extract the 128-bit vector and 6821 // then extract the element from the 128-bit vector. 6822 if (VecVT.getSizeInBits() == 256) { 6823 DebugLoc dl = Op.getNode()->getDebugLoc(); 6824 unsigned NumElems = VecVT.getVectorNumElements(); 6825 SDValue Idx = Op.getOperand(1); 6826 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 6827 6828 // Get the 128-bit vector. 6829 bool Upper = IdxVal >= NumElems/2; 6830 Vec = Extract128BitVector(Vec, 6831 DAG.getConstant(Upper ? 
NumElems/2 : 0, MVT::i32), DAG, dl); 6832 6833 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 6834 Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx); 6835 } 6836 6837 assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); 6838 6839 if (Subtarget->hasSSE41orAVX()) { 6840 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 6841 if (Res.getNode()) 6842 return Res; 6843 } 6844 6845 EVT VT = Op.getValueType(); 6846 DebugLoc dl = Op.getDebugLoc(); 6847 // TODO: handle v16i8. 6848 if (VT.getSizeInBits() == 16) { 6849 SDValue Vec = Op.getOperand(0); 6850 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6851 if (Idx == 0) 6852 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6853 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6854 DAG.getNode(ISD::BITCAST, dl, 6855 MVT::v4i32, Vec), 6856 Op.getOperand(1))); 6857 // Transform it so it matches pextrw which produces a 32-bit result. 6858 EVT EltVT = MVT::i32; 6859 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 6860 Op.getOperand(0), Op.getOperand(1)); 6861 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 6862 DAG.getValueType(VT)); 6863 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6864 } else if (VT.getSizeInBits() == 32) { 6865 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6866 if (Idx == 0) 6867 return Op; 6868 6869 // SHUFPS the element to the lowest double word, then movss. 6870 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; 6871 EVT VVT = Op.getOperand(0).getValueType(); 6872 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6873 DAG.getUNDEF(VVT), Mask); 6874 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6875 DAG.getIntPtrConstant(0)); 6876 } else if (VT.getSizeInBits() == 64) { 6877 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 6878 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 6879 // to match extract_elt for f64. 6880 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6881 if (Idx == 0) 6882 return Op; 6883 6884 // UNPCKHPD the element to the lowest double word, then movsd. 6885 // Note if the lower 64 bits of the result of the UNPCKHPD are then stored 6886 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 6887 int Mask[2] = { 1, -1 }; 6888 EVT VVT = Op.getOperand(0).getValueType(); 6889 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6890 DAG.getUNDEF(VVT), Mask); 6891 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6892 DAG.getIntPtrConstant(0)); 6893 } 6894 6895 return SDValue(); 6896} 6897 6898SDValue 6899X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 6900 SelectionDAG &DAG) const { 6901 EVT VT = Op.getValueType(); 6902 EVT EltVT = VT.getVectorElementType(); 6903 DebugLoc dl = Op.getDebugLoc(); 6904 6905 SDValue N0 = Op.getOperand(0); 6906 SDValue N1 = Op.getOperand(1); 6907 SDValue N2 = Op.getOperand(2); 6908 6909 if (VT.getSizeInBits() == 256) 6910 return SDValue(); 6911 6912 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 6913 isa<ConstantSDNode>(N2)) { 6914 unsigned Opc; 6915 if (VT == MVT::v8i16) 6916 Opc = X86ISD::PINSRW; 6917 else if (VT == MVT::v16i8) 6918 Opc = X86ISD::PINSRB; 6919 else 6920 Opc = X86ISD::PINSRB; 6921 6922 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second 6923 // argument.
6924 if (N1.getValueType() != MVT::i32) 6925 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6926 if (N2.getValueType() != MVT::i32) 6927 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6928 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 6929 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 6930 // Bits [7:6] of the constant are the source select. This will always be 6931 // zero here. The DAG Combiner may combine an extract_elt index into these 6932 // bits. For example (insert (extract, 3), 2) could be matched by putting 6933 // the '3' into bits [7:6] of X86ISD::INSERTPS. 6934 // Bits [5:4] of the constant are the destination select. This is the 6935 // value of the incoming immediate. 6936 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 6937 // combine either bitwise AND or insert of float 0.0 to set these bits. 6938 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 6939 // Create this as a scalar to vector. 6940 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 6941 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 6942 } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) && 6943 isa<ConstantSDNode>(N2)) { 6944 // PINSR* works with constant index. 6945 return Op; 6946 } 6947 return SDValue(); 6948} 6949 6950SDValue 6951X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 6952 EVT VT = Op.getValueType(); 6953 EVT EltVT = VT.getVectorElementType(); 6954 6955 DebugLoc dl = Op.getDebugLoc(); 6956 SDValue N0 = Op.getOperand(0); 6957 SDValue N1 = Op.getOperand(1); 6958 SDValue N2 = Op.getOperand(2); 6959 6960 // If this is a 256-bit vector result, first extract the 128-bit vector, 6961 // insert the element into the extracted half and then place it back. 6962 if (VT.getSizeInBits() == 256) { 6963 if (!isa<ConstantSDNode>(N2)) 6964 return SDValue(); 6965 6966 // Get the desired 128-bit vector half. 6967 unsigned NumElems = VT.getVectorNumElements(); 6968 unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); 6969 bool Upper = IdxVal >= NumElems/2; 6970 SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32); 6971 SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl); 6972 6973 // Insert the element into the desired half. 6974 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, 6975 N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2); 6976 6977 // Insert the changed part back into the 256-bit vector. 6978 return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); 6979 } 6980 6981 if (Subtarget->hasSSE41orAVX()) 6982 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 6983 6984 if (EltVT == MVT::i8) 6985 return SDValue(); 6986 6987 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 6988 // Transform it so it matches pinsrw which expects a 16-bit value in a GR32 6989 // as its second argument.
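// For example, when inserting an i16 element the scalar is any-extended to
// i32 so it can live in a 32-bit GPR, and the index is rebuilt as an
// immediate, which is the operand form pinsrw expects.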
6990 if (N1.getValueType() != MVT::i32) 6991 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6992 if (N2.getValueType() != MVT::i32) 6993 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6994 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 6995 } 6996 return SDValue(); 6997} 6998 6999SDValue 7000X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 7001 LLVMContext *Context = DAG.getContext(); 7002 DebugLoc dl = Op.getDebugLoc(); 7003 EVT OpVT = Op.getValueType(); 7004 7005 // If this is a 256-bit vector result, first insert into a 128-bit 7006 // vector and then insert into the 256-bit vector. 7007 if (OpVT.getSizeInBits() > 128) { 7008 // Insert into a 128-bit vector. 7009 EVT VT128 = EVT::getVectorVT(*Context, 7010 OpVT.getVectorElementType(), 7011 OpVT.getVectorNumElements() / 2); 7012 7013 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 7014 7015 // Insert the 128-bit vector. 7016 return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op, 7017 DAG.getConstant(0, MVT::i32), 7018 DAG, dl); 7019 } 7020 7021 if (Op.getValueType() == MVT::v1i64 && 7022 Op.getOperand(0).getValueType() == MVT::i64) 7023 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 7024 7025 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 7026 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && 7027 "Expected an SSE type!"); 7028 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), 7029 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 7030} 7031 7032// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 7033// a simple subregister reference or explicit instructions to grab 7034// upper bits of a vector. 7035SDValue 7036X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 7037 if (Subtarget->hasAVX()) { 7038 DebugLoc dl = Op.getNode()->getDebugLoc(); 7039 SDValue Vec = Op.getNode()->getOperand(0); 7040 SDValue Idx = Op.getNode()->getOperand(1); 7041 7042 if (Op.getNode()->getValueType(0).getSizeInBits() == 128 7043 && Vec.getNode()->getValueType(0).getSizeInBits() == 256) { 7044 return Extract128BitVector(Vec, Idx, DAG, dl); 7045 } 7046 } 7047 return SDValue(); 7048} 7049 7050// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 7051// simple superregister reference or explicit instructions to insert 7052// the upper bits of a vector. 7053SDValue 7054X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 7055 if (Subtarget->hasAVX()) { 7056 DebugLoc dl = Op.getNode()->getDebugLoc(); 7057 SDValue Vec = Op.getNode()->getOperand(0); 7058 SDValue SubVec = Op.getNode()->getOperand(1); 7059 SDValue Idx = Op.getNode()->getOperand(2); 7060 7061 if (Op.getNode()->getValueType(0).getSizeInBits() == 256 7062 && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { 7063 return Insert128BitVector(Vec, SubVec, Idx, DAG, dl); 7064 } 7065 } 7066 return SDValue(); 7067} 7068 7069// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 7070// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is 7071// one of the above-mentioned nodes. It has to be wrapped because otherwise 7072// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 7073// be used to form an addressing mode. These wrapped nodes will be selected 7074// into MOV32ri.
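// Illustrative shape of the lowered DAG (32-bit PIC, GOT-relative case):
//   (add (X86ISD::GlobalBaseReg),
//        (X86ISD::Wrapper (TargetConstantPool ... @GOTOFF)))
// i.e. the wrapped target node is added to the PIC base register, which is
// what the lowering routines below emit whenever an OpFlag is set.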
7075SDValue 7076X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 7077 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 7078 7079 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7080 // global base reg. 7081 unsigned char OpFlag = 0; 7082 unsigned WrapperKind = X86ISD::Wrapper; 7083 CodeModel::Model M = getTargetMachine().getCodeModel(); 7084 7085 if (Subtarget->isPICStyleRIPRel() && 7086 (M == CodeModel::Small || M == CodeModel::Kernel)) 7087 WrapperKind = X86ISD::WrapperRIP; 7088 else if (Subtarget->isPICStyleGOT()) 7089 OpFlag = X86II::MO_GOTOFF; 7090 else if (Subtarget->isPICStyleStubPIC()) 7091 OpFlag = X86II::MO_PIC_BASE_OFFSET; 7092 7093 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 7094 CP->getAlignment(), 7095 CP->getOffset(), OpFlag); 7096 DebugLoc DL = CP->getDebugLoc(); 7097 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7098 // With PIC, the address is actually $g + Offset. 7099 if (OpFlag) { 7100 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7101 DAG.getNode(X86ISD::GlobalBaseReg, 7102 DebugLoc(), getPointerTy()), 7103 Result); 7104 } 7105 7106 return Result; 7107} 7108 7109SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 7110 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 7111 7112 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7113 // global base reg. 7114 unsigned char OpFlag = 0; 7115 unsigned WrapperKind = X86ISD::Wrapper; 7116 CodeModel::Model M = getTargetMachine().getCodeModel(); 7117 7118 if (Subtarget->isPICStyleRIPRel() && 7119 (M == CodeModel::Small || M == CodeModel::Kernel)) 7120 WrapperKind = X86ISD::WrapperRIP; 7121 else if (Subtarget->isPICStyleGOT()) 7122 OpFlag = X86II::MO_GOTOFF; 7123 else if (Subtarget->isPICStyleStubPIC()) 7124 OpFlag = X86II::MO_PIC_BASE_OFFSET; 7125 7126 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 7127 OpFlag); 7128 DebugLoc DL = JT->getDebugLoc(); 7129 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7130 7131 // With PIC, the address is actually $g + Offset. 7132 if (OpFlag) 7133 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7134 DAG.getNode(X86ISD::GlobalBaseReg, 7135 DebugLoc(), getPointerTy()), 7136 Result); 7137 7138 return Result; 7139} 7140 7141SDValue 7142X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 7143 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 7144 7145 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7146 // global base reg. 7147 unsigned char OpFlag = 0; 7148 unsigned WrapperKind = X86ISD::Wrapper; 7149 CodeModel::Model M = getTargetMachine().getCodeModel(); 7150 7151 if (Subtarget->isPICStyleRIPRel() && 7152 (M == CodeModel::Small || M == CodeModel::Kernel)) { 7153 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) 7154 OpFlag = X86II::MO_GOTPCREL; 7155 WrapperKind = X86ISD::WrapperRIP; 7156 } else if (Subtarget->isPICStyleGOT()) { 7157 OpFlag = X86II::MO_GOT; 7158 } else if (Subtarget->isPICStyleStubPIC()) { 7159 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; 7160 } else if (Subtarget->isPICStyleStubNoDynamic()) { 7161 OpFlag = X86II::MO_DARWIN_NONLAZY; 7162 } 7163 7164 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 7165 7166 DebugLoc DL = Op.getDebugLoc(); 7167 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7168 7169 7170 // With PIC, the address is actually $g + Offset. 
7171 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 7172 !Subtarget->is64Bit()) { 7173 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7174 DAG.getNode(X86ISD::GlobalBaseReg, 7175 DebugLoc(), getPointerTy()), 7176 Result); 7177 } 7178 7179 // For symbols that require a load from a stub to get the address, emit the 7180 // load. 7181 if (isGlobalStubReference(OpFlag)) 7182 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, 7183 MachinePointerInfo::getGOT(), false, false, false, 0); 7184 7185 return Result; 7186} 7187 7188SDValue 7189X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 7190 // Create the TargetBlockAddressAddress node. 7191 unsigned char OpFlags = 7192 Subtarget->ClassifyBlockAddressReference(); 7193 CodeModel::Model M = getTargetMachine().getCodeModel(); 7194 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 7195 DebugLoc dl = Op.getDebugLoc(); 7196 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 7197 /*isTarget=*/true, OpFlags); 7198 7199 if (Subtarget->isPICStyleRIPRel() && 7200 (M == CodeModel::Small || M == CodeModel::Kernel)) 7201 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7202 else 7203 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7204 7205 // With PIC, the address is actually $g + Offset. 7206 if (isGlobalRelativeToPICBase(OpFlags)) { 7207 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7208 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7209 Result); 7210 } 7211 7212 return Result; 7213} 7214 7215SDValue 7216X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 7217 int64_t Offset, 7218 SelectionDAG &DAG) const { 7219 // Create the TargetGlobalAddress node, folding in the constant 7220 // offset if it is legal. 7221 unsigned char OpFlags = 7222 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 7223 CodeModel::Model M = getTargetMachine().getCodeModel(); 7224 SDValue Result; 7225 if (OpFlags == X86II::MO_NO_FLAG && 7226 X86::isOffsetSuitableForCodeModel(Offset, M)) { 7227 // A direct static reference to a global. 7228 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 7229 Offset = 0; 7230 } else { 7231 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 7232 } 7233 7234 if (Subtarget->isPICStyleRIPRel() && 7235 (M == CodeModel::Small || M == CodeModel::Kernel)) 7236 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 7237 else 7238 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 7239 7240 // With PIC, the address is actually $g + Offset. 7241 if (isGlobalRelativeToPICBase(OpFlags)) { 7242 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7243 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 7244 Result); 7245 } 7246 7247 // For globals that require a load from a stub to get the address, emit the 7248 // load. 7249 if (isGlobalStubReference(OpFlags)) 7250 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 7251 MachinePointerInfo::getGOT(), false, false, false, 0); 7252 7253 // If there was a non-zero offset that we didn't fold, create an explicit 7254 // addition for it. 
7255 if (Offset != 0) 7256 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 7257 DAG.getConstant(Offset, getPointerTy())); 7258 7259 return Result; 7260} 7261 7262SDValue 7263X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 7264 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 7265 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 7266 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 7267} 7268 7269static SDValue 7270GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 7271 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 7272 unsigned char OperandFlags) { 7273 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7274 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7275 DebugLoc dl = GA->getDebugLoc(); 7276 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7277 GA->getValueType(0), 7278 GA->getOffset(), 7279 OperandFlags); 7280 if (InFlag) { 7281 SDValue Ops[] = { Chain, TGA, *InFlag }; 7282 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 7283 } else { 7284 SDValue Ops[] = { Chain, TGA }; 7285 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 7286 } 7287 7288 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 7289 MFI->setAdjustsStack(true); 7290 7291 SDValue Flag = Chain.getValue(1); 7292 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 7293} 7294 7295// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 7296static SDValue 7297LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7298 const EVT PtrVT) { 7299 SDValue InFlag; 7300 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 7301 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 7302 DAG.getNode(X86ISD::GlobalBaseReg, 7303 DebugLoc(), PtrVT), InFlag); 7304 InFlag = Chain.getValue(1); 7305 7306 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 7307} 7308 7309// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 7310static SDValue 7311LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7312 const EVT PtrVT) { 7313 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 7314 X86::RAX, X86II::MO_TLSGD); 7315} 7316 7317// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 7318// "local exec" model. 7319static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 7320 const EVT PtrVT, TLSModel::Model model, 7321 bool is64Bit) { 7322 DebugLoc dl = GA->getDebugLoc(); 7323 7324 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 7325 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 7326 is64Bit ? 257 : 256)); 7327 7328 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 7329 DAG.getIntPtrConstant(0), 7330 MachinePointerInfo(Ptr), 7331 false, false, false, 0); 7332 7333 unsigned char OperandFlags = 0; 7334 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 7335 // initialexec. 7336 unsigned WrapperKind = X86ISD::Wrapper; 7337 if (model == TLSModel::LocalExec) { 7338 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 7339 } else if (is64Bit) { 7340 assert(model == TLSModel::InitialExec); 7341 OperandFlags = X86II::MO_GOTTPOFF; 7342 WrapperKind = X86ISD::WrapperRIP; 7343 } else { 7344 assert(model == TLSModel::InitialExec); 7345 OperandFlags = X86II::MO_INDNTPOFF; 7346 } 7347 7348 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 7349 // exec) 7350 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 7351 GA->getValueType(0), 7352 GA->getOffset(), OperandFlags); 7353 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 7354 7355 if (model == TLSModel::InitialExec) 7356 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 7357 MachinePointerInfo::getGOT(), false, false, false, 0); 7358 7359 // The address of the thread local variable is the add of the thread 7360 // pointer with the offset of the variable. 7361 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 7362} 7363 7364SDValue 7365X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 7366 7367 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 7368 const GlobalValue *GV = GA->getGlobal(); 7369 7370 if (Subtarget->isTargetELF()) { 7371 // TODO: implement the "local dynamic" model 7372 // TODO: implement the "initial exec"model for pic executables 7373 7374 // If GV is an alias then use the aliasee for determining 7375 // thread-localness. 7376 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 7377 GV = GA->resolveAliasedGlobal(false); 7378 7379 TLSModel::Model model 7380 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 7381 7382 switch (model) { 7383 case TLSModel::GeneralDynamic: 7384 case TLSModel::LocalDynamic: // not implemented 7385 if (Subtarget->is64Bit()) 7386 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 7387 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 7388 7389 case TLSModel::InitialExec: 7390 case TLSModel::LocalExec: 7391 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 7392 Subtarget->is64Bit()); 7393 } 7394 } else if (Subtarget->isTargetDarwin()) { 7395 // Darwin only has one model of TLS. Lower to that. 7396 unsigned char OpFlag = 0; 7397 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 7398 X86ISD::WrapperRIP : X86ISD::Wrapper; 7399 7400 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 7401 // global base reg. 7402 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 7403 !Subtarget->is64Bit(); 7404 if (PIC32) 7405 OpFlag = X86II::MO_TLVP_PIC_BASE; 7406 else 7407 OpFlag = X86II::MO_TLVP; 7408 DebugLoc DL = Op.getDebugLoc(); 7409 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 7410 GA->getValueType(0), 7411 GA->getOffset(), OpFlag); 7412 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 7413 7414 // With PIC32, the address is actually $g + Offset. 7415 if (PIC32) 7416 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7417 DAG.getNode(X86ISD::GlobalBaseReg, 7418 DebugLoc(), getPointerTy()), 7419 Offset); 7420 7421 // Lowering the machine isd will make sure everything is in the right 7422 // location. 7423 SDValue Chain = DAG.getEntryNode(); 7424 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7425 SDValue Args[] = { Chain, Offset }; 7426 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 7427 7428 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
7429 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7430 MFI->setAdjustsStack(true); 7431 7432 // And our return value (tls address) is in the standard call return value 7433 // location. 7434 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 7435 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), 7436 Chain.getValue(1)); 7437 } 7438 7439 assert(false && 7440 "TLS not implemented for this target."); 7441 7442 llvm_unreachable("Unreachable"); 7443 return SDValue(); 7444} 7445 7446 7447/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and 7448/// take a 2 x i32 value to shift plus a shift amount. 7449SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { 7450 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 7451 EVT VT = Op.getValueType(); 7452 unsigned VTBits = VT.getSizeInBits(); 7453 DebugLoc dl = Op.getDebugLoc(); 7454 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 7455 SDValue ShOpLo = Op.getOperand(0); 7456 SDValue ShOpHi = Op.getOperand(1); 7457 SDValue ShAmt = Op.getOperand(2); 7458 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 7459 DAG.getConstant(VTBits - 1, MVT::i8)) 7460 : DAG.getConstant(0, VT); 7461 7462 SDValue Tmp2, Tmp3; 7463 if (Op.getOpcode() == ISD::SHL_PARTS) { 7464 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 7465 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 7466 } else { 7467 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 7468 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 7469 } 7470 7471 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 7472 DAG.getConstant(VTBits, MVT::i8)); 7473 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 7474 AndNode, DAG.getConstant(0, MVT::i8)); 7475 7476 SDValue Hi, Lo; 7477 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7478 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 7479 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 7480 7481 if (Op.getOpcode() == ISD::SHL_PARTS) { 7482 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7483 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7484 } else { 7485 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 7486 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 7487 } 7488 7489 SDValue Ops[2] = { Lo, Hi }; 7490 return DAG.getMergeValues(Ops, 2, dl); 7491} 7492 7493SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 7494 SelectionDAG &DAG) const { 7495 EVT SrcVT = Op.getOperand(0).getValueType(); 7496 7497 if (SrcVT.isVector()) 7498 return SDValue(); 7499 7500 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 7501 "Unknown SINT_TO_FP to lower!"); 7502 7503 // These are really Legal; return the operand so the caller accepts it as 7504 // Legal. 
7505 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 7506 return Op; 7507 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 7508 Subtarget->is64Bit()) { 7509 return Op; 7510 } 7511 7512 DebugLoc dl = Op.getDebugLoc(); 7513 unsigned Size = SrcVT.getSizeInBits()/8; 7514 MachineFunction &MF = DAG.getMachineFunction(); 7515 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 7516 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7517 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7518 StackSlot, 7519 MachinePointerInfo::getFixedStack(SSFI), 7520 false, false, 0); 7521 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 7522} 7523 7524SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 7525 SDValue StackSlot, 7526 SelectionDAG &DAG) const { 7527 // Build the FILD 7528 DebugLoc DL = Op.getDebugLoc(); 7529 SDVTList Tys; 7530 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 7531 if (useSSE) 7532 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 7533 else 7534 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 7535 7536 unsigned ByteSize = SrcVT.getSizeInBits()/8; 7537 7538 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 7539 MachineMemOperand *MMO; 7540 if (FI) { 7541 int SSFI = FI->getIndex(); 7542 MMO = 7543 DAG.getMachineFunction() 7544 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7545 MachineMemOperand::MOLoad, ByteSize, ByteSize); 7546 } else { 7547 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 7548 StackSlot = StackSlot.getOperand(1); 7549 } 7550 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 7551 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 7552 X86ISD::FILD, DL, 7553 Tys, Ops, array_lengthof(Ops), 7554 SrcVT, MMO); 7555 7556 if (useSSE) { 7557 Chain = Result.getValue(1); 7558 SDValue InFlag = Result.getValue(2); 7559 7560 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 7561 // shouldn't be necessary except that RFP cannot be live across 7562 // multiple blocks. When stackifier is fixed, they can be uncoupled. 7563 MachineFunction &MF = DAG.getMachineFunction(); 7564 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 7565 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 7566 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7567 Tys = DAG.getVTList(MVT::Other); 7568 SDValue Ops[] = { 7569 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 7570 }; 7571 MachineMemOperand *MMO = 7572 DAG.getMachineFunction() 7573 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7574 MachineMemOperand::MOStore, SSFISize, SSFISize); 7575 7576 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 7577 Ops, array_lengthof(Ops), 7578 Op.getValueType(), MMO); 7579 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 7580 MachinePointerInfo::getFixedStack(SSFI), 7581 false, false, false, 0); 7582 } 7583 7584 return Result; 7585} 7586 7587// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 7588SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 7589 SelectionDAG &DAG) const { 7590 // This algorithm is not obvious. Here it is in C code, more or less: 7591 /* 7592 double uint64_to_double( uint32_t hi, uint32_t lo ) { 7593 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 7594 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 7595 7596 // Copy ints to xmm registers. 
7597 __m128i xh = _mm_cvtsi32_si128( hi ); 7598 __m128i xl = _mm_cvtsi32_si128( lo ); 7599 7600 // Combine into low half of a single xmm register. 7601 __m128i x = _mm_unpacklo_epi32( xh, xl ); 7602 __m128d d; 7603 double sd; 7604 7605 // Merge in appropriate exponents to give the integer bits the right 7606 // magnitude. 7607 x = _mm_unpacklo_epi32( x, exp ); 7608 7609 // Subtract away the biases to deal with the IEEE-754 double precision 7610 // implicit 1. 7611 d = _mm_sub_pd( (__m128d) x, bias ); 7612 7613 // All conversions up to here are exact. The correctly rounded result is 7614 // calculated using the current rounding mode using the following 7615 // horizontal add. 7616 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 7617 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 7618 // store doesn't really need to be here (except 7619 // maybe to zero the other double) 7620 return sd; 7621 } 7622 */ 7623 7624 DebugLoc dl = Op.getDebugLoc(); 7625 LLVMContext *Context = DAG.getContext(); 7626 7627 // Build some magic constants. 7628 SmallVector<Constant*,4> CV0; 7629 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 7630 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 7631 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7632 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7633 Constant *C0 = ConstantVector::get(CV0); 7634 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 7635 7636 SmallVector<Constant*,2> CV1; 7637 CV1.push_back( 7638 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 7639 CV1.push_back( 7640 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 7641 Constant *C1 = ConstantVector::get(CV1); 7642 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 7643 7644 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7645 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7646 Op.getOperand(0), 7647 DAG.getIntPtrConstant(1))); 7648 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7649 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7650 Op.getOperand(0), 7651 DAG.getIntPtrConstant(0))); 7652 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 7653 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 7654 MachinePointerInfo::getConstantPool(), 7655 false, false, false, 16); 7656 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 7657 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); 7658 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 7659 MachinePointerInfo::getConstantPool(), 7660 false, false, false, 16); 7661 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 7662 7663 // Add the halves; easiest way is to swap them into another reg first. 7664 int ShufMask[2] = { 1, -1 }; 7665 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 7666 DAG.getUNDEF(MVT::v2f64), ShufMask); 7667 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 7668 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 7669 DAG.getIntPtrConstant(0)); 7670} 7671 7672// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 7673SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 7674 SelectionDAG &DAG) const { 7675 DebugLoc dl = Op.getDebugLoc(); 7676 // FP constant to bias correct the final result. 
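// The bias bit pattern 0x4330000000000000 is 2^52 as an IEEE-754 double. ORing
// a 32-bit value into the low mantissa bits of 2^52 yields exactly 2^52 + x,
// so subtracting the bias afterwards recovers x converted to double. Roughly
// (illustration only): (double)x == as_double(0x4330000000000000ULL | x) - 0x1.0p52.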
7677 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 7678 MVT::f64); 7679 7680 // Load the 32-bit value into an XMM register. 7681 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7682 Op.getOperand(0)); 7683 7684 // Zero out the upper parts of the register. 7685 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(), 7686 DAG); 7687 7688 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7689 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 7690 DAG.getIntPtrConstant(0)); 7691 7692 // Or the load with the bias. 7693 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 7694 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7695 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7696 MVT::v2f64, Load)), 7697 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7698 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7699 MVT::v2f64, Bias))); 7700 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7701 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 7702 DAG.getIntPtrConstant(0)); 7703 7704 // Subtract the bias. 7705 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 7706 7707 // Handle final rounding. 7708 EVT DestVT = Op.getValueType(); 7709 7710 if (DestVT.bitsLT(MVT::f64)) { 7711 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 7712 DAG.getIntPtrConstant(0)); 7713 } else if (DestVT.bitsGT(MVT::f64)) { 7714 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 7715 } 7716 7717 // Handle final rounding. 7718 return Sub; 7719} 7720 7721SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 7722 SelectionDAG &DAG) const { 7723 SDValue N0 = Op.getOperand(0); 7724 DebugLoc dl = Op.getDebugLoc(); 7725 7726 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 7727 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 7728 // the optimization here. 7729 if (DAG.SignBitIsZero(N0)) 7730 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 7731 7732 EVT SrcVT = N0.getValueType(); 7733 EVT DstVT = Op.getValueType(); 7734 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 7735 return LowerUINT_TO_FP_i64(Op, DAG); 7736 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 7737 return LowerUINT_TO_FP_i32(Op, DAG); 7738 7739 // Make a 64-bit buffer, and use it to build an FILD. 7740 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 7741 if (SrcVT == MVT::i32) { 7742 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 7743 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 7744 getPointerTy(), StackSlot, WordOff); 7745 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7746 StackSlot, MachinePointerInfo(), 7747 false, false, 0); 7748 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 7749 OffsetSlot, MachinePointerInfo(), 7750 false, false, 0); 7751 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 7752 return Fild; 7753 } 7754 7755 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 7756 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7757 StackSlot, MachinePointerInfo(), 7758 false, false, 0); 7759 // For i64 source, we need to add the appropriate power of 2 if the input 7760 // was negative. This is the same as the optimization in 7761 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 7762 // we must be careful to do the computation in x87 extended precision, not 7763 // in SSE. (The generic code can't know it's OK to do this, or how to.) 
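// (Sketch of the idea, not generated code: FILD interprets the i64 as signed,
// so an input with the sign bit set comes back 2^64 too small. The fudge
// constant 0x5F800000 used below is 2^64 as an IEEE-754 float; it is added in
// x87 80-bit precision only when the input was negative, i.e.
//   result = (long double)(int64_t)x + ((x >> 63) ? 0x1.0p64 : 0.0).)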
7764 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 7765 MachineMemOperand *MMO = 7766 DAG.getMachineFunction() 7767 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7768 MachineMemOperand::MOLoad, 8, 8); 7769 7770 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 7771 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 7772 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 7773 MVT::i64, MMO); 7774 7775 APInt FF(32, 0x5F800000ULL); 7776 7777 // Check whether the sign bit is set. 7778 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 7779 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 7780 ISD::SETLT); 7781 7782 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 7783 SDValue FudgePtr = DAG.getConstantPool( 7784 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 7785 getPointerTy()); 7786 7787 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 7788 SDValue Zero = DAG.getIntPtrConstant(0); 7789 SDValue Four = DAG.getIntPtrConstant(4); 7790 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 7791 Zero, Four); 7792 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 7793 7794 // Load the value out, extending it from f32 to f80. 7795 // FIXME: Avoid the extend by constructing the right constant pool? 7796 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 7797 FudgePtr, MachinePointerInfo::getConstantPool(), 7798 MVT::f32, false, false, 4); 7799 // Extend everything to 80 bits to force it to be done on x87. 7800 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 7801 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 7802} 7803 7804std::pair<SDValue,SDValue> X86TargetLowering:: 7805FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 7806 DebugLoc DL = Op.getDebugLoc(); 7807 7808 EVT DstTy = Op.getValueType(); 7809 7810 if (!IsSigned) { 7811 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 7812 DstTy = MVT::i64; 7813 } 7814 7815 assert(DstTy.getSimpleVT() <= MVT::i64 && 7816 DstTy.getSimpleVT() >= MVT::i16 && 7817 "Unknown FP_TO_SINT to lower!"); 7818 7819 // These are really Legal. 7820 if (DstTy == MVT::i32 && 7821 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7822 return std::make_pair(SDValue(), SDValue()); 7823 if (Subtarget->is64Bit() && 7824 DstTy == MVT::i64 && 7825 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7826 return std::make_pair(SDValue(), SDValue()); 7827 7828 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 7829 // stack slot. 
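// (Roughly, and only as an illustration for an SSE f64 source: the value is
// spilled, reloaded with FLD onto the x87 stack, and the FP_TO_INT*_IN_MEM
// pseudo later expands to a save-control-word / set-round-to-truncate /
// FISTP / restore-control-word sequence; the caller then reloads the slot as
// an integer.)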
7830 MachineFunction &MF = DAG.getMachineFunction(); 7831 unsigned MemSize = DstTy.getSizeInBits()/8; 7832 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7833 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7834 7835 7836 7837 unsigned Opc; 7838 switch (DstTy.getSimpleVT().SimpleTy) { 7839 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 7840 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 7841 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 7842 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 7843 } 7844 7845 SDValue Chain = DAG.getEntryNode(); 7846 SDValue Value = Op.getOperand(0); 7847 EVT TheVT = Op.getOperand(0).getValueType(); 7848 if (isScalarFPTypeInSSEReg(TheVT)) { 7849 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 7850 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 7851 MachinePointerInfo::getFixedStack(SSFI), 7852 false, false, 0); 7853 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 7854 SDValue Ops[] = { 7855 Chain, StackSlot, DAG.getValueType(TheVT) 7856 }; 7857 7858 MachineMemOperand *MMO = 7859 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7860 MachineMemOperand::MOLoad, MemSize, MemSize); 7861 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 7862 DstTy, MMO); 7863 Chain = Value.getValue(1); 7864 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7865 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7866 } 7867 7868 MachineMemOperand *MMO = 7869 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7870 MachineMemOperand::MOStore, MemSize, MemSize); 7871 7872 // Build the FP_TO_INT*_IN_MEM 7873 SDValue Ops[] = { Chain, Value, StackSlot }; 7874 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 7875 Ops, 3, DstTy, MMO); 7876 7877 return std::make_pair(FIST, StackSlot); 7878} 7879 7880SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 7881 SelectionDAG &DAG) const { 7882 if (Op.getValueType().isVector()) 7883 return SDValue(); 7884 7885 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 7886 SDValue FIST = Vals.first, StackSlot = Vals.second; 7887 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 7888 if (FIST.getNode() == 0) return Op; 7889 7890 // Load the result. 7891 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7892 FIST, StackSlot, MachinePointerInfo(), 7893 false, false, false, 0); 7894} 7895 7896SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 7897 SelectionDAG &DAG) const { 7898 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 7899 SDValue FIST = Vals.first, StackSlot = Vals.second; 7900 assert(FIST.getNode() && "Unexpected failure"); 7901 7902 // Load the result. 
7903 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7904 FIST, StackSlot, MachinePointerInfo(), 7905 false, false, false, 0); 7906} 7907 7908SDValue X86TargetLowering::LowerFABS(SDValue Op, 7909 SelectionDAG &DAG) const { 7910 LLVMContext *Context = DAG.getContext(); 7911 DebugLoc dl = Op.getDebugLoc(); 7912 EVT VT = Op.getValueType(); 7913 EVT EltVT = VT; 7914 if (VT.isVector()) 7915 EltVT = VT.getVectorElementType(); 7916 SmallVector<Constant*,4> CV; 7917 if (EltVT == MVT::f64) { 7918 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 7919 CV.assign(2, C); 7920 } else { 7921 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 7922 CV.assign(4, C); 7923 } 7924 Constant *C = ConstantVector::get(CV); 7925 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7926 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7927 MachinePointerInfo::getConstantPool(), 7928 false, false, false, 16); 7929 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 7930} 7931 7932SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 7933 LLVMContext *Context = DAG.getContext(); 7934 DebugLoc dl = Op.getDebugLoc(); 7935 EVT VT = Op.getValueType(); 7936 EVT EltVT = VT; 7937 unsigned NumElts = VT == MVT::f64 ? 2 : 4; 7938 if (VT.isVector()) { 7939 EltVT = VT.getVectorElementType(); 7940 NumElts = VT.getVectorNumElements(); 7941 } 7942 SmallVector<Constant*,8> CV; 7943 if (EltVT == MVT::f64) { 7944 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 7945 CV.assign(NumElts, C); 7946 } else { 7947 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 7948 CV.assign(NumElts, C); 7949 } 7950 Constant *C = ConstantVector::get(CV); 7951 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7952 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7953 MachinePointerInfo::getConstantPool(), 7954 false, false, false, 16); 7955 if (VT.isVector()) { 7956 MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; 7957 return DAG.getNode(ISD::BITCAST, dl, VT, 7958 DAG.getNode(ISD::XOR, dl, XORVT, 7959 DAG.getNode(ISD::BITCAST, dl, XORVT, 7960 Op.getOperand(0)), 7961 DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); 7962 } else { 7963 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 7964 } 7965} 7966 7967SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 7968 LLVMContext *Context = DAG.getContext(); 7969 SDValue Op0 = Op.getOperand(0); 7970 SDValue Op1 = Op.getOperand(1); 7971 DebugLoc dl = Op.getDebugLoc(); 7972 EVT VT = Op.getValueType(); 7973 EVT SrcVT = Op1.getValueType(); 7974 7975 // If second operand is smaller, extend it first. 7976 if (SrcVT.bitsLT(VT)) { 7977 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 7978 SrcVT = VT; 7979 } 7980 // And if it is bigger, shrink it first. 7981 if (SrcVT.bitsGT(VT)) { 7982 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 7983 SrcVT = VT; 7984 } 7985 7986 // At this point the operands and the result should have the same 7987 // type, and that won't be f80 since that is not custom lowered. 7988 7989 // First get the sign bit of second operand. 
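// (The overall computation is the usual mask form of copysign,
//   result = (Op0 & ~sign_mask) | (Op1 & sign_mask),
// implemented with FAND/FOR against the constant-pool masks built below.)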
7990 SmallVector<Constant*,4> CV; 7991 if (SrcVT == MVT::f64) { 7992 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 7993 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7994 } else { 7995 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 7996 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7997 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7998 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7999 } 8000 Constant *C = ConstantVector::get(CV); 8001 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8002 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 8003 MachinePointerInfo::getConstantPool(), 8004 false, false, false, 16); 8005 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 8006 8007 // Shift sign bit right or left if the two operands have different types. 8008 if (SrcVT.bitsGT(VT)) { 8009 // Op0 is MVT::f32, Op1 is MVT::f64. 8010 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 8011 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 8012 DAG.getConstant(32, MVT::i32)); 8013 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 8014 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 8015 DAG.getIntPtrConstant(0)); 8016 } 8017 8018 // Clear first operand sign bit. 8019 CV.clear(); 8020 if (VT == MVT::f64) { 8021 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 8022 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 8023 } else { 8024 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 8025 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8026 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8027 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 8028 } 8029 C = ConstantVector::get(CV); 8030 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8031 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8032 MachinePointerInfo::getConstantPool(), 8033 false, false, false, 16); 8034 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 8035 8036 // Or the value with the sign bit. 8037 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 8038} 8039 8040SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { 8041 SDValue N0 = Op.getOperand(0); 8042 DebugLoc dl = Op.getDebugLoc(); 8043 EVT VT = Op.getValueType(); 8044 8045 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 8046 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 8047 DAG.getConstant(1, VT)); 8048 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 8049} 8050 8051/// Emit nodes that will be selected as "test Op0,Op0", or something 8052/// equivalent. 8053SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 8054 SelectionDAG &DAG) const { 8055 DebugLoc dl = Op.getDebugLoc(); 8056 8057 // CF and OF aren't always set the way we want. Determine which 8058 // of these we need. 
8059 bool NeedCF = false;
8060 bool NeedOF = false;
8061 switch (X86CC) {
8062 default: break;
8063 case X86::COND_A: case X86::COND_AE:
8064 case X86::COND_B: case X86::COND_BE:
8065 NeedCF = true;
8066 break;
8067 case X86::COND_G: case X86::COND_GE:
8068 case X86::COND_L: case X86::COND_LE:
8069 case X86::COND_O: case X86::COND_NO:
8070 NeedOF = true;
8071 break;
8072 }
8073
8074 // See if we can use the EFLAGS value from the operand instead of
8075 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
8076 // we prove that the arithmetic won't overflow, we can't use OF or CF.
8077 if (Op.getResNo() != 0 || NeedOF || NeedCF)
8078 // Emit a CMP with 0, which is the TEST pattern.
8079 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
8080 DAG.getConstant(0, Op.getValueType()));
8081
8082 unsigned Opcode = 0;
8083 unsigned NumOperands = 0;
8084 switch (Op.getNode()->getOpcode()) {
8085 case ISD::ADD:
8086 // Due to an isel shortcoming, be conservative if this add is likely to be
8087 // selected as part of a load-modify-store instruction. When the root node
8088 // in a match is a store, isel doesn't know how to remap non-chain non-flag
8089 // uses of other nodes in the match, such as the ADD in this case. This
8090 // leads to the ADD being left around and reselected, with the result being
8091 // two adds in the output. Alas, even if none of our users are stores, that
8092 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
8093 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
8094 // climbing the DAG back to the root, and it doesn't seem to be worth the
8095 // effort.
8096 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8097 UE = Op.getNode()->use_end(); UI != UE; ++UI)
8098 if (UI->getOpcode() != ISD::CopyToReg &&
8099 UI->getOpcode() != ISD::SETCC &&
8100 UI->getOpcode() != ISD::STORE)
8101 goto default_case;
8102
8103 if (ConstantSDNode *C =
8104 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
8105 // An add of one will be selected as an INC.
8106 if (C->getAPIntValue() == 1) {
8107 Opcode = X86ISD::INC;
8108 NumOperands = 1;
8109 break;
8110 }
8111
8112 // An add of negative one (subtract of one) will be selected as a DEC.
8113 if (C->getAPIntValue().isAllOnesValue()) {
8114 Opcode = X86ISD::DEC;
8115 NumOperands = 1;
8116 break;
8117 }
8118 }
8119
8120 // Otherwise use a regular EFLAGS-setting add.
8121 Opcode = X86ISD::ADD;
8122 NumOperands = 2;
8123 break;
8124 case ISD::AND: {
8125 // If the primary result of the 'and' isn't used, don't bother using
8126 // X86ISD::AND, because a TEST instruction will be better.
8127 bool NonFlagUse = false;
8128 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
8129 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
8130 SDNode *User = *UI;
8131 unsigned UOpNo = UI.getOperandNo();
8132 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
8133 // Look past the truncate.
8134 UOpNo = User->use_begin().getOperandNo();
8135 User = *User->use_begin();
8136 }
8137
8138 if (User->getOpcode() != ISD::BRCOND &&
8139 User->getOpcode() != ISD::SETCC &&
8140 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
8141 NonFlagUse = true;
8142 break;
8143 }
8144 }
8145
8146 if (!NonFlagUse)
8147 break;
8148 }
8149 // FALL THROUGH
8150 case ISD::SUB:
8151 case ISD::OR:
8152 case ISD::XOR:
8153 // Due to the ISEL shortcoming noted above, be conservative if this op is
8154 // likely to be selected as part of a load-modify-store instruction.
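// (For example, an OR whose value only feeds a store back to the same address
// can be selected as a single "or [mem], reg"; reusing its flag result here
// could leave a duplicated OR in the output, as described for ADD above.)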
8155 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 8156 UE = Op.getNode()->use_end(); UI != UE; ++UI) 8157 if (UI->getOpcode() == ISD::STORE) 8158 goto default_case; 8159 8160 // Otherwise use a regular EFLAGS-setting instruction. 8161 switch (Op.getNode()->getOpcode()) { 8162 default: llvm_unreachable("unexpected operator!"); 8163 case ISD::SUB: Opcode = X86ISD::SUB; break; 8164 case ISD::OR: Opcode = X86ISD::OR; break; 8165 case ISD::XOR: Opcode = X86ISD::XOR; break; 8166 case ISD::AND: Opcode = X86ISD::AND; break; 8167 } 8168 8169 NumOperands = 2; 8170 break; 8171 case X86ISD::ADD: 8172 case X86ISD::SUB: 8173 case X86ISD::INC: 8174 case X86ISD::DEC: 8175 case X86ISD::OR: 8176 case X86ISD::XOR: 8177 case X86ISD::AND: 8178 return SDValue(Op.getNode(), 1); 8179 default: 8180 default_case: 8181 break; 8182 } 8183 8184 if (Opcode == 0) 8185 // Emit a CMP with 0, which is the TEST pattern. 8186 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 8187 DAG.getConstant(0, Op.getValueType())); 8188 8189 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 8190 SmallVector<SDValue, 4> Ops; 8191 for (unsigned i = 0; i != NumOperands; ++i) 8192 Ops.push_back(Op.getOperand(i)); 8193 8194 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 8195 DAG.ReplaceAllUsesWith(Op, New); 8196 return SDValue(New.getNode(), 1); 8197} 8198 8199/// Emit nodes that will be selected as "cmp Op0,Op1", or something 8200/// equivalent. 8201SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 8202 SelectionDAG &DAG) const { 8203 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 8204 if (C->getAPIntValue() == 0) 8205 return EmitTest(Op0, X86CC, DAG); 8206 8207 DebugLoc dl = Op0.getDebugLoc(); 8208 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 8209} 8210 8211/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 8212/// if it's possible. 8213SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 8214 DebugLoc dl, SelectionDAG &DAG) const { 8215 SDValue Op0 = And.getOperand(0); 8216 SDValue Op1 = And.getOperand(1); 8217 if (Op0.getOpcode() == ISD::TRUNCATE) 8218 Op0 = Op0.getOperand(0); 8219 if (Op1.getOpcode() == ISD::TRUNCATE) 8220 Op1 = Op1.getOperand(0); 8221 8222 SDValue LHS, RHS; 8223 if (Op1.getOpcode() == ISD::SHL) 8224 std::swap(Op0, Op1); 8225 if (Op0.getOpcode() == ISD::SHL) { 8226 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 8227 if (And00C->getZExtValue() == 1) { 8228 // If we looked past a truncate, check that it's only truncating away 8229 // known zeros. 8230 unsigned BitWidth = Op0.getValueSizeInBits(); 8231 unsigned AndBitWidth = And.getValueSizeInBits(); 8232 if (BitWidth > AndBitWidth) { 8233 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 8234 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 8235 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 8236 return SDValue(); 8237 } 8238 LHS = Op1; 8239 RHS = Op0.getOperand(1); 8240 } 8241 } else if (Op1.getOpcode() == ISD::Constant) { 8242 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 8243 uint64_t AndRHSVal = AndRHS->getZExtValue(); 8244 SDValue AndLHS = Op0; 8245 8246 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { 8247 LHS = AndLHS.getOperand(0); 8248 RHS = AndLHS.getOperand(1); 8249 } 8250 8251 // Use BT if the immediate can't be encoded in a TEST instruction. 
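// (For example, for i64 the test (x & (1ULL << 40)) != 0 cannot use TEST, whose
// immediate is at most 32 bits, but it maps directly to "bt x, 40".)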
8252 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { 8253 LHS = AndLHS; 8254 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); 8255 } 8256 } 8257 8258 if (LHS.getNode()) { 8259 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 8260 // instruction. Since the shift amount is in-range-or-undefined, we know 8261 // that doing a bittest on the i32 value is ok. We extend to i32 because 8262 // the encoding for the i16 version is larger than the i32 version. 8263 // Also promote i16 to i32 for performance / code size reason. 8264 if (LHS.getValueType() == MVT::i8 || 8265 LHS.getValueType() == MVT::i16) 8266 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 8267 8268 // If the operand types disagree, extend the shift amount to match. Since 8269 // BT ignores high bits (like shifts) we can use anyextend. 8270 if (LHS.getValueType() != RHS.getValueType()) 8271 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 8272 8273 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 8274 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 8275 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8276 DAG.getConstant(Cond, MVT::i8), BT); 8277 } 8278 8279 return SDValue(); 8280} 8281 8282SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 8283 8284 if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); 8285 8286 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 8287 SDValue Op0 = Op.getOperand(0); 8288 SDValue Op1 = Op.getOperand(1); 8289 DebugLoc dl = Op.getDebugLoc(); 8290 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 8291 8292 // Optimize to BT if possible. 8293 // Lower (X & (1 << N)) == 0 to BT(X, N). 8294 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 8295 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 8296 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 8297 Op1.getOpcode() == ISD::Constant && 8298 cast<ConstantSDNode>(Op1)->isNullValue() && 8299 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8300 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 8301 if (NewSetCC.getNode()) 8302 return NewSetCC; 8303 } 8304 8305 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 8306 // these. 8307 if (Op1.getOpcode() == ISD::Constant && 8308 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 8309 cast<ConstantSDNode>(Op1)->isNullValue()) && 8310 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 8311 8312 // If the input is a setcc, then reuse the input setcc or use a new one with 8313 // the inverted condition. 8314 if (Op0.getOpcode() == X86ISD::SETCC) { 8315 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 8316 bool Invert = (CC == ISD::SETNE) ^ 8317 cast<ConstantSDNode>(Op1)->isNullValue(); 8318 if (!Invert) return Op0; 8319 8320 CCode = X86::GetOppositeBranchCondition(CCode); 8321 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8322 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 8323 } 8324 } 8325 8326 bool isFP = Op1.getValueType().isFloatingPoint(); 8327 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 8328 if (X86CC == X86::COND_INVALID) 8329 return SDValue(); 8330 8331 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 8332 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8333 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 8334} 8335 8336// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 8337// ones, and then concatenate the result back. 
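// For example (illustration only), a v8i32 setcc without AVX2 is done as two
// v4i32 compares (e.g. PCMPGTD/PCMPEQD) on the extracted 128-bit halves, and
// the two results are rejoined with a 256-bit CONCAT_VECTORS (vinsertf128).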
8338static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { 8339 EVT VT = Op.getValueType(); 8340 8341 assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC && 8342 "Unsupported value type for operation"); 8343 8344 int NumElems = VT.getVectorNumElements(); 8345 DebugLoc dl = Op.getDebugLoc(); 8346 SDValue CC = Op.getOperand(2); 8347 SDValue Idx0 = DAG.getConstant(0, MVT::i32); 8348 SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); 8349 8350 // Extract the LHS vectors 8351 SDValue LHS = Op.getOperand(0); 8352 SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); 8353 SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); 8354 8355 // Extract the RHS vectors 8356 SDValue RHS = Op.getOperand(1); 8357 SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); 8358 SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); 8359 8360 // Issue the operation on the smaller types and concatenate the result back 8361 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 8362 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 8363 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 8364 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), 8365 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); 8366} 8367 8368 8369SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 8370 SDValue Cond; 8371 SDValue Op0 = Op.getOperand(0); 8372 SDValue Op1 = Op.getOperand(1); 8373 SDValue CC = Op.getOperand(2); 8374 EVT VT = Op.getValueType(); 8375 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 8376 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 8377 DebugLoc dl = Op.getDebugLoc(); 8378 8379 if (isFP) { 8380 unsigned SSECC = 8; 8381 EVT EltVT = Op0.getValueType().getVectorElementType(); 8382 assert(EltVT == MVT::f32 || EltVT == MVT::f64); 8383 8384 unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 8385 bool Swap = false; 8386 8387 // SSE Condition code mapping: 8388 // 0 - EQ 8389 // 1 - LT 8390 // 2 - LE 8391 // 3 - UNORD 8392 // 4 - NEQ 8393 // 5 - NLT 8394 // 6 - NLE 8395 // 7 - ORD 8396 switch (SetCCOpcode) { 8397 default: break; 8398 case ISD::SETOEQ: 8399 case ISD::SETEQ: SSECC = 0; break; 8400 case ISD::SETOGT: 8401 case ISD::SETGT: Swap = true; // Fallthrough 8402 case ISD::SETLT: 8403 case ISD::SETOLT: SSECC = 1; break; 8404 case ISD::SETOGE: 8405 case ISD::SETGE: Swap = true; // Fallthrough 8406 case ISD::SETLE: 8407 case ISD::SETOLE: SSECC = 2; break; 8408 case ISD::SETUO: SSECC = 3; break; 8409 case ISD::SETUNE: 8410 case ISD::SETNE: SSECC = 4; break; 8411 case ISD::SETULE: Swap = true; 8412 case ISD::SETUGE: SSECC = 5; break; 8413 case ISD::SETULT: Swap = true; 8414 case ISD::SETUGT: SSECC = 6; break; 8415 case ISD::SETO: SSECC = 7; break; 8416 } 8417 if (Swap) 8418 std::swap(Op0, Op1); 8419 8420 // In the two special cases we can't handle, emit two comparisons. 
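// (SETUEQ is "unordered or equal": compare with predicate 3 (UNORD) ORed with
// predicate 0 (EQ). SETONE is "ordered and not equal": predicate 7 (ORD) ANDed
// with predicate 4 (NEQ). In SSE terms, roughly cmpunordps|cmpeqps and
// cmpordps&cmpneqps; the packed-single mnemonics are just for illustration.)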
8421 if (SSECC == 8) { 8422 if (SetCCOpcode == ISD::SETUEQ) { 8423 SDValue UNORD, EQ; 8424 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 8425 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 8426 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 8427 } else if (SetCCOpcode == ISD::SETONE) { 8428 SDValue ORD, NEQ; 8429 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 8430 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 8431 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 8432 } 8433 llvm_unreachable("Illegal FP comparison"); 8434 } 8435 // Handle all other FP comparisons here. 8436 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 8437 } 8438 8439 // Break 256-bit integer vector compare into smaller ones. 8440 if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) 8441 return Lower256IntVSETCC(Op, DAG); 8442 8443 // We are handling one of the integer comparisons here. Since SSE only has 8444 // GT and EQ comparisons for integer, swapping operands and multiple 8445 // operations may be required for some comparisons. 8446 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 8447 bool Swap = false, Invert = false, FlipSigns = false; 8448 8449 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { 8450 default: break; 8451 case MVT::i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 8452 case MVT::i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 8453 case MVT::i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 8454 case MVT::i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 8455 } 8456 8457 switch (SetCCOpcode) { 8458 default: break; 8459 case ISD::SETNE: Invert = true; 8460 case ISD::SETEQ: Opc = EQOpc; break; 8461 case ISD::SETLT: Swap = true; 8462 case ISD::SETGT: Opc = GTOpc; break; 8463 case ISD::SETGE: Swap = true; 8464 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 8465 case ISD::SETULT: Swap = true; 8466 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 8467 case ISD::SETUGE: Swap = true; 8468 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 8469 } 8470 if (Swap) 8471 std::swap(Op0, Op1); 8472 8473 // Check that the operation in question is available (most are plain SSE2, 8474 // but PCMPGTQ and PCMPEQQ have different requirements). 8475 if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42orAVX()) 8476 return SDValue(); 8477 if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41orAVX()) 8478 return SDValue(); 8479 8480 // Since SSE has no unsigned integer comparisons, we need to flip the sign 8481 // bits of the inputs before performing those operations. 8482 if (FlipSigns) { 8483 EVT EltVT = VT.getVectorElementType(); 8484 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 8485 EltVT); 8486 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 8487 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 8488 SignBits.size()); 8489 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 8490 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 8491 } 8492 8493 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 8494 8495 // If the logical-not of the result is required, perform that now. 8496 if (Invert) 8497 Result = DAG.getNOT(dl, Result, VT); 8498 8499 return Result; 8500} 8501 8502// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
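// (That is, nodes whose i32 EFLAGS result can feed SETCC/CMOV/BRCOND directly,
// e.g. the flag value produced by an X86ISD::ADD or X86ISD::SUB.)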
8503static bool isX86LogicalCmp(SDValue Op) { 8504 unsigned Opc = Op.getNode()->getOpcode(); 8505 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 8506 return true; 8507 if (Op.getResNo() == 1 && 8508 (Opc == X86ISD::ADD || 8509 Opc == X86ISD::SUB || 8510 Opc == X86ISD::ADC || 8511 Opc == X86ISD::SBB || 8512 Opc == X86ISD::SMUL || 8513 Opc == X86ISD::UMUL || 8514 Opc == X86ISD::INC || 8515 Opc == X86ISD::DEC || 8516 Opc == X86ISD::OR || 8517 Opc == X86ISD::XOR || 8518 Opc == X86ISD::AND)) 8519 return true; 8520 8521 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 8522 return true; 8523 8524 return false; 8525} 8526 8527static bool isZero(SDValue V) { 8528 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8529 return C && C->isNullValue(); 8530} 8531 8532static bool isAllOnes(SDValue V) { 8533 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 8534 return C && C->isAllOnesValue(); 8535} 8536 8537SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 8538 bool addTest = true; 8539 SDValue Cond = Op.getOperand(0); 8540 SDValue Op1 = Op.getOperand(1); 8541 SDValue Op2 = Op.getOperand(2); 8542 DebugLoc DL = Op.getDebugLoc(); 8543 SDValue CC; 8544 8545 if (Cond.getOpcode() == ISD::SETCC) { 8546 SDValue NewCond = LowerSETCC(Cond, DAG); 8547 if (NewCond.getNode()) 8548 Cond = NewCond; 8549 } 8550 8551 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 8552 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 8553 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 8554 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 8555 if (Cond.getOpcode() == X86ISD::SETCC && 8556 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 8557 isZero(Cond.getOperand(1).getOperand(1))) { 8558 SDValue Cmp = Cond.getOperand(1); 8559 8560 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 8561 8562 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 8563 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 8564 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 8565 8566 SDValue CmpOp0 = Cmp.getOperand(0); 8567 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 8568 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 8569 8570 SDValue Res = // Res = 0 or -1. 8571 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8572 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 8573 8574 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 8575 Res = DAG.getNOT(DL, Res, Res.getValueType()); 8576 8577 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 8578 if (N2C == 0 || !N2C->isNullValue()) 8579 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 8580 return Res; 8581 } 8582 } 8583 8584 // Look past (and (setcc_carry (cmp ...)), 1). 8585 if (Cond.getOpcode() == ISD::AND && 8586 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8587 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8588 if (C && C->getAPIntValue() == 1) 8589 Cond = Cond.getOperand(0); 8590 } 8591 8592 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8593 // setting operand in place of the X86ISD::SETCC. 8594 unsigned CondOpcode = Cond.getOpcode(); 8595 if (CondOpcode == X86ISD::SETCC || 8596 CondOpcode == X86ISD::SETCC_CARRY) { 8597 CC = Cond.getOperand(0); 8598 8599 SDValue Cmp = Cond.getOperand(1); 8600 unsigned Opc = Cmp.getOpcode(); 8601 EVT VT = Op.getValueType(); 8602 8603 bool IllegalFPCMov = false; 8604 if (VT.isFloatingPoint() && !VT.isVector() && 8605 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
8606 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 8607 8608 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 8609 Opc == X86ISD::BT) { // FIXME 8610 Cond = Cmp; 8611 addTest = false; 8612 } 8613 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 8614 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 8615 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 8616 Cond.getOperand(0).getValueType() != MVT::i8)) { 8617 SDValue LHS = Cond.getOperand(0); 8618 SDValue RHS = Cond.getOperand(1); 8619 unsigned X86Opcode; 8620 unsigned X86Cond; 8621 SDVTList VTs; 8622 switch (CondOpcode) { 8623 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 8624 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 8625 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 8626 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 8627 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 8628 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 8629 default: llvm_unreachable("unexpected overflowing operator"); 8630 } 8631 if (CondOpcode == ISD::UMULO) 8632 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 8633 MVT::i32); 8634 else 8635 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 8636 8637 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); 8638 8639 if (CondOpcode == ISD::UMULO) 8640 Cond = X86Op.getValue(2); 8641 else 8642 Cond = X86Op.getValue(1); 8643 8644 CC = DAG.getConstant(X86Cond, MVT::i8); 8645 addTest = false; 8646 } 8647 8648 if (addTest) { 8649 // Look pass the truncate. 8650 if (Cond.getOpcode() == ISD::TRUNCATE) 8651 Cond = Cond.getOperand(0); 8652 8653 // We know the result of AND is compared against zero. Try to match 8654 // it to BT. 8655 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8656 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 8657 if (NewSetCC.getNode()) { 8658 CC = NewSetCC.getOperand(0); 8659 Cond = NewSetCC.getOperand(1); 8660 addTest = false; 8661 } 8662 } 8663 } 8664 8665 if (addTest) { 8666 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8667 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8668 } 8669 8670 // a < b ? -1 : 0 -> RES = ~setcc_carry 8671 // a < b ? 0 : -1 -> RES = setcc_carry 8672 // a >= b ? -1 : 0 -> RES = setcc_carry 8673 // a >= b ? 0 : -1 -> RES = ~setcc_carry 8674 if (Cond.getOpcode() == X86ISD::CMP) { 8675 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 8676 8677 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 8678 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 8679 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8680 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 8681 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 8682 return DAG.getNOT(DL, Res, Res.getValueType()); 8683 return Res; 8684 } 8685 } 8686 8687 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 8688 // condition is true. 8689 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 8690 SDValue Ops[] = { Op2, Op1, CC, Cond }; 8691 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 8692} 8693 8694// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 8695// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 8696// from the AND / OR. 
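// (LowerBRCOND uses this to recognize the AND/OR-of-setcc patterns produced
// for FCMP_OEQ and FCMP_UNE, e.g. (setcc E) & (setcc NP), and to re-expand
// them into two conditional branches instead of a materialized boolean.)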
8697static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 8698 Opc = Op.getOpcode(); 8699 if (Opc != ISD::OR && Opc != ISD::AND) 8700 return false; 8701 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8702 Op.getOperand(0).hasOneUse() && 8703 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 8704 Op.getOperand(1).hasOneUse()); 8705} 8706 8707// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 8708// 1 and that the SETCC node has a single use. 8709static bool isXor1OfSetCC(SDValue Op) { 8710 if (Op.getOpcode() != ISD::XOR) 8711 return false; 8712 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 8713 if (N1C && N1C->getAPIntValue() == 1) { 8714 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8715 Op.getOperand(0).hasOneUse(); 8716 } 8717 return false; 8718} 8719 8720SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 8721 bool addTest = true; 8722 SDValue Chain = Op.getOperand(0); 8723 SDValue Cond = Op.getOperand(1); 8724 SDValue Dest = Op.getOperand(2); 8725 DebugLoc dl = Op.getDebugLoc(); 8726 SDValue CC; 8727 bool Inverted = false; 8728 8729 if (Cond.getOpcode() == ISD::SETCC) { 8730 // Check for setcc([su]{add,sub,mul}o == 0). 8731 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && 8732 isa<ConstantSDNode>(Cond.getOperand(1)) && 8733 cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && 8734 Cond.getOperand(0).getResNo() == 1 && 8735 (Cond.getOperand(0).getOpcode() == ISD::SADDO || 8736 Cond.getOperand(0).getOpcode() == ISD::UADDO || 8737 Cond.getOperand(0).getOpcode() == ISD::SSUBO || 8738 Cond.getOperand(0).getOpcode() == ISD::USUBO || 8739 Cond.getOperand(0).getOpcode() == ISD::SMULO || 8740 Cond.getOperand(0).getOpcode() == ISD::UMULO)) { 8741 Inverted = true; 8742 Cond = Cond.getOperand(0); 8743 } else { 8744 SDValue NewCond = LowerSETCC(Cond, DAG); 8745 if (NewCond.getNode()) 8746 Cond = NewCond; 8747 } 8748 } 8749#if 0 8750 // FIXME: LowerXALUO doesn't handle these!! 8751 else if (Cond.getOpcode() == X86ISD::ADD || 8752 Cond.getOpcode() == X86ISD::SUB || 8753 Cond.getOpcode() == X86ISD::SMUL || 8754 Cond.getOpcode() == X86ISD::UMUL) 8755 Cond = LowerXALUO(Cond, DAG); 8756#endif 8757 8758 // Look pass (and (setcc_carry (cmp ...)), 1). 8759 if (Cond.getOpcode() == ISD::AND && 8760 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8761 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8762 if (C && C->getAPIntValue() == 1) 8763 Cond = Cond.getOperand(0); 8764 } 8765 8766 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8767 // setting operand in place of the X86ISD::SETCC. 8768 unsigned CondOpcode = Cond.getOpcode(); 8769 if (CondOpcode == X86ISD::SETCC || 8770 CondOpcode == X86ISD::SETCC_CARRY) { 8771 CC = Cond.getOperand(0); 8772 8773 SDValue Cmp = Cond.getOperand(1); 8774 unsigned Opc = Cmp.getOpcode(); 8775 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 8776 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 8777 Cond = Cmp; 8778 addTest = false; 8779 } else { 8780 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 8781 default: break; 8782 case X86::COND_O: 8783 case X86::COND_B: 8784 // These can only come from an arithmetic instruction with overflow, 8785 // e.g. SADDO, UADDO. 
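// (So the SETCC's flag operand - the EFLAGS result of that arithmetic node -
// becomes the branch condition directly, and no extra TEST is emitted.)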
8786 Cond = Cond.getNode()->getOperand(1); 8787 addTest = false; 8788 break; 8789 } 8790 } 8791 } 8792 CondOpcode = Cond.getOpcode(); 8793 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 8794 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 8795 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 8796 Cond.getOperand(0).getValueType() != MVT::i8)) { 8797 SDValue LHS = Cond.getOperand(0); 8798 SDValue RHS = Cond.getOperand(1); 8799 unsigned X86Opcode; 8800 unsigned X86Cond; 8801 SDVTList VTs; 8802 switch (CondOpcode) { 8803 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 8804 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 8805 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 8806 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 8807 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 8808 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 8809 default: llvm_unreachable("unexpected overflowing operator"); 8810 } 8811 if (Inverted) 8812 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); 8813 if (CondOpcode == ISD::UMULO) 8814 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 8815 MVT::i32); 8816 else 8817 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 8818 8819 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); 8820 8821 if (CondOpcode == ISD::UMULO) 8822 Cond = X86Op.getValue(2); 8823 else 8824 Cond = X86Op.getValue(1); 8825 8826 CC = DAG.getConstant(X86Cond, MVT::i8); 8827 addTest = false; 8828 } else { 8829 unsigned CondOpc; 8830 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 8831 SDValue Cmp = Cond.getOperand(0).getOperand(1); 8832 if (CondOpc == ISD::OR) { 8833 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 8834 // two branches instead of an explicit OR instruction with a 8835 // separate test. 8836 if (Cmp == Cond.getOperand(1).getOperand(1) && 8837 isX86LogicalCmp(Cmp)) { 8838 CC = Cond.getOperand(0).getOperand(0); 8839 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8840 Chain, Dest, CC, Cmp); 8841 CC = Cond.getOperand(1).getOperand(0); 8842 Cond = Cmp; 8843 addTest = false; 8844 } 8845 } else { // ISD::AND 8846 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 8847 // two branches instead of an explicit AND instruction with a 8848 // separate test. However, we only do this if this block doesn't 8849 // have a fall-through edge, because this requires an explicit 8850 // jmp when the condition is false. 8851 if (Cmp == Cond.getOperand(1).getOperand(1) && 8852 isX86LogicalCmp(Cmp) && 8853 Op.getNode()->hasOneUse()) { 8854 X86::CondCode CCode = 8855 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8856 CCode = X86::GetOppositeBranchCondition(CCode); 8857 CC = DAG.getConstant(CCode, MVT::i8); 8858 SDNode *User = *Op.getNode()->use_begin(); 8859 // Look for an unconditional branch following this conditional branch. 8860 // We need this because we need to reverse the successors in order 8861 // to implement FCMP_OEQ. 
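// (Illustration only: br (fcmp oeq x, y), %T, %F ends up roughly as
//   ucomisd x, y ; jne %F ; jp %F ; jmp %T
// i.e. both "not equal" and "unordered" are routed to the false block.)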
8862 if (User->getOpcode() == ISD::BR) { 8863 SDValue FalseBB = User->getOperand(1); 8864 SDNode *NewBR = 8865 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 8866 assert(NewBR == User); 8867 (void)NewBR; 8868 Dest = FalseBB; 8869 8870 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8871 Chain, Dest, CC, Cmp); 8872 X86::CondCode CCode = 8873 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 8874 CCode = X86::GetOppositeBranchCondition(CCode); 8875 CC = DAG.getConstant(CCode, MVT::i8); 8876 Cond = Cmp; 8877 addTest = false; 8878 } 8879 } 8880 } 8881 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 8882 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 8883 // It should be transformed during dag combiner except when the condition 8884 // is set by a arithmetics with overflow node. 8885 X86::CondCode CCode = 8886 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8887 CCode = X86::GetOppositeBranchCondition(CCode); 8888 CC = DAG.getConstant(CCode, MVT::i8); 8889 Cond = Cond.getOperand(0).getOperand(1); 8890 addTest = false; 8891 } else if (Cond.getOpcode() == ISD::SETCC && 8892 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { 8893 // For FCMP_OEQ, we can emit 8894 // two branches instead of an explicit AND instruction with a 8895 // separate test. However, we only do this if this block doesn't 8896 // have a fall-through edge, because this requires an explicit 8897 // jmp when the condition is false. 8898 if (Op.getNode()->hasOneUse()) { 8899 SDNode *User = *Op.getNode()->use_begin(); 8900 // Look for an unconditional branch following this conditional branch. 8901 // We need this because we need to reverse the successors in order 8902 // to implement FCMP_OEQ. 8903 if (User->getOpcode() == ISD::BR) { 8904 SDValue FalseBB = User->getOperand(1); 8905 SDNode *NewBR = 8906 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 8907 assert(NewBR == User); 8908 (void)NewBR; 8909 Dest = FalseBB; 8910 8911 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 8912 Cond.getOperand(0), Cond.getOperand(1)); 8913 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8914 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8915 Chain, Dest, CC, Cmp); 8916 CC = DAG.getConstant(X86::COND_P, MVT::i8); 8917 Cond = Cmp; 8918 addTest = false; 8919 } 8920 } 8921 } else if (Cond.getOpcode() == ISD::SETCC && 8922 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { 8923 // For FCMP_UNE, we can emit 8924 // two branches instead of an explicit AND instruction with a 8925 // separate test. However, we only do this if this block doesn't 8926 // have a fall-through edge, because this requires an explicit 8927 // jmp when the condition is false. 8928 if (Op.getNode()->hasOneUse()) { 8929 SDNode *User = *Op.getNode()->use_begin(); 8930 // Look for an unconditional branch following this conditional branch. 8931 // We need this because we need to reverse the successors in order 8932 // to implement FCMP_UNE. 
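// (Illustration only: br (fcmp une x, y), %T, %F ends up roughly as
//   ucomisd x, y ; jne %T ; jnp %F ; jmp %T
// so the unordered case is treated as "not equal" and branches to %T.)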
8933 if (User->getOpcode() == ISD::BR) { 8934 SDValue FalseBB = User->getOperand(1); 8935 SDNode *NewBR = 8936 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 8937 assert(NewBR == User); 8938 (void)NewBR; 8939 8940 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 8941 Cond.getOperand(0), Cond.getOperand(1)); 8942 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8943 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8944 Chain, Dest, CC, Cmp); 8945 CC = DAG.getConstant(X86::COND_NP, MVT::i8); 8946 Cond = Cmp; 8947 addTest = false; 8948 Dest = FalseBB; 8949 } 8950 } 8951 } 8952 } 8953 8954 if (addTest) { 8955 // Look pass the truncate. 8956 if (Cond.getOpcode() == ISD::TRUNCATE) 8957 Cond = Cond.getOperand(0); 8958 8959 // We know the result of AND is compared against zero. Try to match 8960 // it to BT. 8961 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8962 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 8963 if (NewSetCC.getNode()) { 8964 CC = NewSetCC.getOperand(0); 8965 Cond = NewSetCC.getOperand(1); 8966 addTest = false; 8967 } 8968 } 8969 } 8970 8971 if (addTest) { 8972 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8973 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8974 } 8975 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8976 Chain, Dest, CC, Cond); 8977} 8978 8979 8980// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 8981// Calls to _alloca is needed to probe the stack when allocating more than 4k 8982// bytes in one go. Touching the stack at 4K increments is necessary to ensure 8983// that the guard pages used by the OS virtual memory manager are allocated in 8984// correct sequence. 8985SDValue 8986X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 8987 SelectionDAG &DAG) const { 8988 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || 8989 getTargetMachine().Options.EnableSegmentedStacks) && 8990 "This should be used only on Windows targets or when segmented stacks " 8991 "are being used"); 8992 assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); 8993 DebugLoc dl = Op.getDebugLoc(); 8994 8995 // Get the inputs. 8996 SDValue Chain = Op.getOperand(0); 8997 SDValue Size = Op.getOperand(1); 8998 // FIXME: Ensure alignment here 8999 9000 bool Is64Bit = Subtarget->is64Bit(); 9001 EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; 9002 9003 if (getTargetMachine().Options.EnableSegmentedStacks) { 9004 MachineFunction &MF = DAG.getMachineFunction(); 9005 MachineRegisterInfo &MRI = MF.getRegInfo(); 9006 9007 if (Is64Bit) { 9008 // The 64 bit implementation of segmented stacks needs to clobber both r10 9009 // r11. This makes it impossible to use it along with nested parameters. 9010 const Function *F = MF.getFunction(); 9011 9012 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); 9013 I != E; I++) 9014 if (I->hasNestAttr()) 9015 report_fatal_error("Cannot use segmented stacks with functions that " 9016 "have nested arguments."); 9017 } 9018 9019 const TargetRegisterClass *AddrRegClass = 9020 getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32); 9021 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); 9022 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); 9023 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, 9024 DAG.getRegister(Vreg, SPTy)); 9025 SDValue Ops1[2] = { Value, Chain }; 9026 return DAG.getMergeValues(Ops1, 2, dl); 9027 } else { 9028 SDValue Flag; 9029 unsigned Reg = (Subtarget->is64Bit() ? 
X86::RAX : X86::EAX); 9030 9031 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 9032 Flag = Chain.getValue(1); 9033 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 9034 9035 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 9036 Flag = Chain.getValue(1); 9037 9038 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 9039 9040 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 9041 return DAG.getMergeValues(Ops1, 2, dl); 9042 } 9043} 9044 9045SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 9046 MachineFunction &MF = DAG.getMachineFunction(); 9047 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 9048 9049 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9050 DebugLoc DL = Op.getDebugLoc(); 9051 9052 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 9053 // vastart just stores the address of the VarArgsFrameIndex slot into the 9054 // memory location argument. 9055 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 9056 getPointerTy()); 9057 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 9058 MachinePointerInfo(SV), false, false, 0); 9059 } 9060 9061 // __va_list_tag: 9062 // gp_offset (0 - 6 * 8) 9063 // fp_offset (48 - 48 + 8 * 16) 9064 // overflow_arg_area (point to parameters coming in memory). 9065 // reg_save_area 9066 SmallVector<SDValue, 8> MemOps; 9067 SDValue FIN = Op.getOperand(1); 9068 // Store gp_offset 9069 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 9070 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 9071 MVT::i32), 9072 FIN, MachinePointerInfo(SV), false, false, 0); 9073 MemOps.push_back(Store); 9074 9075 // Store fp_offset 9076 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9077 FIN, DAG.getIntPtrConstant(4)); 9078 Store = DAG.getStore(Op.getOperand(0), DL, 9079 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 9080 MVT::i32), 9081 FIN, MachinePointerInfo(SV, 4), false, false, 0); 9082 MemOps.push_back(Store); 9083 9084 // Store ptr to overflow_arg_area 9085 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9086 FIN, DAG.getIntPtrConstant(4)); 9087 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 9088 getPointerTy()); 9089 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 9090 MachinePointerInfo(SV, 8), 9091 false, false, 0); 9092 MemOps.push_back(Store); 9093 9094 // Store ptr to reg_save_area. 
9095 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 9096 FIN, DAG.getIntPtrConstant(8)); 9097 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 9098 getPointerTy()); 9099 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 9100 MachinePointerInfo(SV, 16), false, false, 0); 9101 MemOps.push_back(Store); 9102 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 9103 &MemOps[0], MemOps.size()); 9104} 9105 9106SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 9107 assert(Subtarget->is64Bit() && 9108 "LowerVAARG only handles 64-bit va_arg!"); 9109 assert((Subtarget->isTargetLinux() || 9110 Subtarget->isTargetDarwin()) && 9111 "Unhandled target in LowerVAARG"); 9112 assert(Op.getNode()->getNumOperands() == 4); 9113 SDValue Chain = Op.getOperand(0); 9114 SDValue SrcPtr = Op.getOperand(1); 9115 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9116 unsigned Align = Op.getConstantOperandVal(3); 9117 DebugLoc dl = Op.getDebugLoc(); 9118 9119 EVT ArgVT = Op.getNode()->getValueType(0); 9120 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 9121 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 9122 uint8_t ArgMode; 9123 9124 // Decide which area this value should be read from. 9125 // TODO: Implement the AMD64 ABI in its entirety. This simple 9126 // selection mechanism works only for the basic types. 9127 if (ArgVT == MVT::f80) { 9128 llvm_unreachable("va_arg for f80 not yet implemented"); 9129 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 9130 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 9131 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 9132 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 9133 } else { 9134 llvm_unreachable("Unhandled argument type in LowerVAARG"); 9135 } 9136 9137 if (ArgMode == 2) { 9138 // Sanity Check: Make sure using fp_offset makes sense. 9139 assert(!getTargetMachine().Options.UseSoftFloat && 9140 !(DAG.getMachineFunction() 9141 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 9142 Subtarget->hasXMM()); 9143 } 9144 9145 // Insert VAARG_64 node into the DAG 9146 // VAARG_64 returns two values: Variable Argument Address, Chain 9147 SmallVector<SDValue, 11> InstOps; 9148 InstOps.push_back(Chain); 9149 InstOps.push_back(SrcPtr); 9150 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 9151 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 9152 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 9153 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 9154 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 9155 VTs, &InstOps[0], InstOps.size(), 9156 MVT::i64, 9157 MachinePointerInfo(SV), 9158 /*Align=*/0, 9159 /*Volatile=*/false, 9160 /*ReadMem=*/true, 9161 /*WriteMem=*/true); 9162 Chain = VAARG.getValue(1); 9163 9164 // Load the next argument and return it 9165 return DAG.getLoad(ArgVT, dl, 9166 Chain, 9167 VAARG, 9168 MachinePointerInfo(), 9169 false, false, false, 0); 9170} 9171 9172SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 9173 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
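// (Concretely, a sketch of the layout assumed here:
//    struct __va_list_tag {
//      i32  gp_offset;           // byte offset 0
//      i32  fp_offset;           // byte offset 4
//      i8  *overflow_arg_area;   // byte offset 8
//      i8  *reg_save_area;       // byte offset 16
//    };                          // 24 bytes in total
//  which is why the memcpy below copies 24 bytes with 8-byte alignment.)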
9174 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 9175 SDValue Chain = Op.getOperand(0); 9176 SDValue DstPtr = Op.getOperand(1); 9177 SDValue SrcPtr = Op.getOperand(2); 9178 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 9179 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 9180 DebugLoc DL = Op.getDebugLoc(); 9181 9182 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 9183 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 9184 false, 9185 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 9186} 9187 9188SDValue 9189X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 9190 DebugLoc dl = Op.getDebugLoc(); 9191 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9192 switch (IntNo) { 9193 default: return SDValue(); // Don't custom lower most intrinsics. 9194 // Comparison intrinsics. 9195 case Intrinsic::x86_sse_comieq_ss: 9196 case Intrinsic::x86_sse_comilt_ss: 9197 case Intrinsic::x86_sse_comile_ss: 9198 case Intrinsic::x86_sse_comigt_ss: 9199 case Intrinsic::x86_sse_comige_ss: 9200 case Intrinsic::x86_sse_comineq_ss: 9201 case Intrinsic::x86_sse_ucomieq_ss: 9202 case Intrinsic::x86_sse_ucomilt_ss: 9203 case Intrinsic::x86_sse_ucomile_ss: 9204 case Intrinsic::x86_sse_ucomigt_ss: 9205 case Intrinsic::x86_sse_ucomige_ss: 9206 case Intrinsic::x86_sse_ucomineq_ss: 9207 case Intrinsic::x86_sse2_comieq_sd: 9208 case Intrinsic::x86_sse2_comilt_sd: 9209 case Intrinsic::x86_sse2_comile_sd: 9210 case Intrinsic::x86_sse2_comigt_sd: 9211 case Intrinsic::x86_sse2_comige_sd: 9212 case Intrinsic::x86_sse2_comineq_sd: 9213 case Intrinsic::x86_sse2_ucomieq_sd: 9214 case Intrinsic::x86_sse2_ucomilt_sd: 9215 case Intrinsic::x86_sse2_ucomile_sd: 9216 case Intrinsic::x86_sse2_ucomigt_sd: 9217 case Intrinsic::x86_sse2_ucomige_sd: 9218 case Intrinsic::x86_sse2_ucomineq_sd: { 9219 unsigned Opc = 0; 9220 ISD::CondCode CC = ISD::SETCC_INVALID; 9221 switch (IntNo) { 9222 default: break; 9223 case Intrinsic::x86_sse_comieq_ss: 9224 case Intrinsic::x86_sse2_comieq_sd: 9225 Opc = X86ISD::COMI; 9226 CC = ISD::SETEQ; 9227 break; 9228 case Intrinsic::x86_sse_comilt_ss: 9229 case Intrinsic::x86_sse2_comilt_sd: 9230 Opc = X86ISD::COMI; 9231 CC = ISD::SETLT; 9232 break; 9233 case Intrinsic::x86_sse_comile_ss: 9234 case Intrinsic::x86_sse2_comile_sd: 9235 Opc = X86ISD::COMI; 9236 CC = ISD::SETLE; 9237 break; 9238 case Intrinsic::x86_sse_comigt_ss: 9239 case Intrinsic::x86_sse2_comigt_sd: 9240 Opc = X86ISD::COMI; 9241 CC = ISD::SETGT; 9242 break; 9243 case Intrinsic::x86_sse_comige_ss: 9244 case Intrinsic::x86_sse2_comige_sd: 9245 Opc = X86ISD::COMI; 9246 CC = ISD::SETGE; 9247 break; 9248 case Intrinsic::x86_sse_comineq_ss: 9249 case Intrinsic::x86_sse2_comineq_sd: 9250 Opc = X86ISD::COMI; 9251 CC = ISD::SETNE; 9252 break; 9253 case Intrinsic::x86_sse_ucomieq_ss: 9254 case Intrinsic::x86_sse2_ucomieq_sd: 9255 Opc = X86ISD::UCOMI; 9256 CC = ISD::SETEQ; 9257 break; 9258 case Intrinsic::x86_sse_ucomilt_ss: 9259 case Intrinsic::x86_sse2_ucomilt_sd: 9260 Opc = X86ISD::UCOMI; 9261 CC = ISD::SETLT; 9262 break; 9263 case Intrinsic::x86_sse_ucomile_ss: 9264 case Intrinsic::x86_sse2_ucomile_sd: 9265 Opc = X86ISD::UCOMI; 9266 CC = ISD::SETLE; 9267 break; 9268 case Intrinsic::x86_sse_ucomigt_ss: 9269 case Intrinsic::x86_sse2_ucomigt_sd: 9270 Opc = X86ISD::UCOMI; 9271 CC = ISD::SETGT; 9272 break; 9273 case Intrinsic::x86_sse_ucomige_ss: 9274 case Intrinsic::x86_sse2_ucomige_sd: 9275 Opc = X86ISD::UCOMI; 9276 
CC = ISD::SETGE; 9277 break; 9278 case Intrinsic::x86_sse_ucomineq_ss: 9279 case Intrinsic::x86_sse2_ucomineq_sd: 9280 Opc = X86ISD::UCOMI; 9281 CC = ISD::SETNE; 9282 break; 9283 } 9284 9285 SDValue LHS = Op.getOperand(1); 9286 SDValue RHS = Op.getOperand(2); 9287 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 9288 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 9289 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 9290 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 9291 DAG.getConstant(X86CC, MVT::i8), Cond); 9292 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 9293 } 9294 // Arithmetic intrinsics. 9295 case Intrinsic::x86_sse3_hadd_ps: 9296 case Intrinsic::x86_sse3_hadd_pd: 9297 case Intrinsic::x86_avx_hadd_ps_256: 9298 case Intrinsic::x86_avx_hadd_pd_256: 9299 return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(), 9300 Op.getOperand(1), Op.getOperand(2)); 9301 case Intrinsic::x86_sse3_hsub_ps: 9302 case Intrinsic::x86_sse3_hsub_pd: 9303 case Intrinsic::x86_avx_hsub_ps_256: 9304 case Intrinsic::x86_avx_hsub_pd_256: 9305 return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(), 9306 Op.getOperand(1), Op.getOperand(2)); 9307 case Intrinsic::x86_avx2_psllv_d: 9308 case Intrinsic::x86_avx2_psllv_q: 9309 case Intrinsic::x86_avx2_psllv_d_256: 9310 case Intrinsic::x86_avx2_psllv_q_256: 9311 return DAG.getNode(ISD::SHL, dl, Op.getValueType(), 9312 Op.getOperand(1), Op.getOperand(2)); 9313 case Intrinsic::x86_avx2_psrlv_d: 9314 case Intrinsic::x86_avx2_psrlv_q: 9315 case Intrinsic::x86_avx2_psrlv_d_256: 9316 case Intrinsic::x86_avx2_psrlv_q_256: 9317 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), 9318 Op.getOperand(1), Op.getOperand(2)); 9319 case Intrinsic::x86_avx2_psrav_d: 9320 case Intrinsic::x86_avx2_psrav_d_256: 9321 return DAG.getNode(ISD::SRA, dl, Op.getValueType(), 9322 Op.getOperand(1), Op.getOperand(2)); 9323 9324 // ptest and testp intrinsics. The intrinsics these come from are designed to 9325 // return an integer value, not just an instruction, so lower it to the ptest 9326 // or testp pattern and a setcc for the result.
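// For example (a sketch, not the exact DAG built below): the IR-level call
//   %r = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a, <2 x i64> %b)
// becomes roughly
//   flags = X86ISD::PTEST %a, %b            ; ZF set iff (a & b) == 0
//   %r    = zext (X86ISD::SETCC COND_E, flags) to i32
// The testc/testnzc/testp variants differ only in the test node and the
// condition code chosen below.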
9327 case Intrinsic::x86_sse41_ptestz: 9328 case Intrinsic::x86_sse41_ptestc: 9329 case Intrinsic::x86_sse41_ptestnzc: 9330 case Intrinsic::x86_avx_ptestz_256: 9331 case Intrinsic::x86_avx_ptestc_256: 9332 case Intrinsic::x86_avx_ptestnzc_256: 9333 case Intrinsic::x86_avx_vtestz_ps: 9334 case Intrinsic::x86_avx_vtestc_ps: 9335 case Intrinsic::x86_avx_vtestnzc_ps: 9336 case Intrinsic::x86_avx_vtestz_pd: 9337 case Intrinsic::x86_avx_vtestc_pd: 9338 case Intrinsic::x86_avx_vtestnzc_pd: 9339 case Intrinsic::x86_avx_vtestz_ps_256: 9340 case Intrinsic::x86_avx_vtestc_ps_256: 9341 case Intrinsic::x86_avx_vtestnzc_ps_256: 9342 case Intrinsic::x86_avx_vtestz_pd_256: 9343 case Intrinsic::x86_avx_vtestc_pd_256: 9344 case Intrinsic::x86_avx_vtestnzc_pd_256: { 9345 bool IsTestPacked = false; 9346 unsigned X86CC = 0; 9347 switch (IntNo) { 9348 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 9349 case Intrinsic::x86_avx_vtestz_ps: 9350 case Intrinsic::x86_avx_vtestz_pd: 9351 case Intrinsic::x86_avx_vtestz_ps_256: 9352 case Intrinsic::x86_avx_vtestz_pd_256: 9353 IsTestPacked = true; // Fallthrough 9354 case Intrinsic::x86_sse41_ptestz: 9355 case Intrinsic::x86_avx_ptestz_256: 9356 // ZF = 1 9357 X86CC = X86::COND_E; 9358 break; 9359 case Intrinsic::x86_avx_vtestc_ps: 9360 case Intrinsic::x86_avx_vtestc_pd: 9361 case Intrinsic::x86_avx_vtestc_ps_256: 9362 case Intrinsic::x86_avx_vtestc_pd_256: 9363 IsTestPacked = true; // Fallthrough 9364 case Intrinsic::x86_sse41_ptestc: 9365 case Intrinsic::x86_avx_ptestc_256: 9366 // CF = 1 9367 X86CC = X86::COND_B; 9368 break; 9369 case Intrinsic::x86_avx_vtestnzc_ps: 9370 case Intrinsic::x86_avx_vtestnzc_pd: 9371 case Intrinsic::x86_avx_vtestnzc_ps_256: 9372 case Intrinsic::x86_avx_vtestnzc_pd_256: 9373 IsTestPacked = true; // Fallthrough 9374 case Intrinsic::x86_sse41_ptestnzc: 9375 case Intrinsic::x86_avx_ptestnzc_256: 9376 // ZF and CF = 0 9377 X86CC = X86::COND_A; 9378 break; 9379 } 9380 9381 SDValue LHS = Op.getOperand(1); 9382 SDValue RHS = Op.getOperand(2); 9383 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 9384 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 9385 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 9386 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 9387 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 9388 } 9389 9390 // Fix vector shift instructions where the last operand is a non-immediate 9391 // i32 value. 
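// For example (a sketch): when the shift amount 'n' is not a constant,
//   x86_sse2_pslli_w(v, n)                       // immediate form
// is rewritten below as roughly
//   amt = build_vector(n, 0, undef, undef)       // v4i32; upper half of the
//                                                // 64-bit amount must be zero
//   x86_sse2_psll_w(v, bitcast amt to v8i16)     // register form of the shift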
9392 case Intrinsic::x86_avx2_pslli_w: 9393 case Intrinsic::x86_avx2_pslli_d: 9394 case Intrinsic::x86_avx2_pslli_q: 9395 case Intrinsic::x86_avx2_psrli_w: 9396 case Intrinsic::x86_avx2_psrli_d: 9397 case Intrinsic::x86_avx2_psrli_q: 9398 case Intrinsic::x86_avx2_psrai_w: 9399 case Intrinsic::x86_avx2_psrai_d: 9400 case Intrinsic::x86_sse2_pslli_w: 9401 case Intrinsic::x86_sse2_pslli_d: 9402 case Intrinsic::x86_sse2_pslli_q: 9403 case Intrinsic::x86_sse2_psrli_w: 9404 case Intrinsic::x86_sse2_psrli_d: 9405 case Intrinsic::x86_sse2_psrli_q: 9406 case Intrinsic::x86_sse2_psrai_w: 9407 case Intrinsic::x86_sse2_psrai_d: 9408 case Intrinsic::x86_mmx_pslli_w: 9409 case Intrinsic::x86_mmx_pslli_d: 9410 case Intrinsic::x86_mmx_pslli_q: 9411 case Intrinsic::x86_mmx_psrli_w: 9412 case Intrinsic::x86_mmx_psrli_d: 9413 case Intrinsic::x86_mmx_psrli_q: 9414 case Intrinsic::x86_mmx_psrai_w: 9415 case Intrinsic::x86_mmx_psrai_d: { 9416 SDValue ShAmt = Op.getOperand(2); 9417 if (isa<ConstantSDNode>(ShAmt)) 9418 return SDValue(); 9419 9420 unsigned NewIntNo = 0; 9421 EVT ShAmtVT = MVT::v4i32; 9422 switch (IntNo) { 9423 case Intrinsic::x86_sse2_pslli_w: 9424 NewIntNo = Intrinsic::x86_sse2_psll_w; 9425 break; 9426 case Intrinsic::x86_sse2_pslli_d: 9427 NewIntNo = Intrinsic::x86_sse2_psll_d; 9428 break; 9429 case Intrinsic::x86_sse2_pslli_q: 9430 NewIntNo = Intrinsic::x86_sse2_psll_q; 9431 break; 9432 case Intrinsic::x86_sse2_psrli_w: 9433 NewIntNo = Intrinsic::x86_sse2_psrl_w; 9434 break; 9435 case Intrinsic::x86_sse2_psrli_d: 9436 NewIntNo = Intrinsic::x86_sse2_psrl_d; 9437 break; 9438 case Intrinsic::x86_sse2_psrli_q: 9439 NewIntNo = Intrinsic::x86_sse2_psrl_q; 9440 break; 9441 case Intrinsic::x86_sse2_psrai_w: 9442 NewIntNo = Intrinsic::x86_sse2_psra_w; 9443 break; 9444 case Intrinsic::x86_sse2_psrai_d: 9445 NewIntNo = Intrinsic::x86_sse2_psra_d; 9446 break; 9447 case Intrinsic::x86_avx2_pslli_w: 9448 NewIntNo = Intrinsic::x86_avx2_psll_w; 9449 break; 9450 case Intrinsic::x86_avx2_pslli_d: 9451 NewIntNo = Intrinsic::x86_avx2_psll_d; 9452 break; 9453 case Intrinsic::x86_avx2_pslli_q: 9454 NewIntNo = Intrinsic::x86_avx2_psll_q; 9455 break; 9456 case Intrinsic::x86_avx2_psrli_w: 9457 NewIntNo = Intrinsic::x86_avx2_psrl_w; 9458 break; 9459 case Intrinsic::x86_avx2_psrli_d: 9460 NewIntNo = Intrinsic::x86_avx2_psrl_d; 9461 break; 9462 case Intrinsic::x86_avx2_psrli_q: 9463 NewIntNo = Intrinsic::x86_avx2_psrl_q; 9464 break; 9465 case Intrinsic::x86_avx2_psrai_w: 9466 NewIntNo = Intrinsic::x86_avx2_psra_w; 9467 break; 9468 case Intrinsic::x86_avx2_psrai_d: 9469 NewIntNo = Intrinsic::x86_avx2_psra_d; 9470 break; 9471 default: { 9472 ShAmtVT = MVT::v2i32; 9473 switch (IntNo) { 9474 case Intrinsic::x86_mmx_pslli_w: 9475 NewIntNo = Intrinsic::x86_mmx_psll_w; 9476 break; 9477 case Intrinsic::x86_mmx_pslli_d: 9478 NewIntNo = Intrinsic::x86_mmx_psll_d; 9479 break; 9480 case Intrinsic::x86_mmx_pslli_q: 9481 NewIntNo = Intrinsic::x86_mmx_psll_q; 9482 break; 9483 case Intrinsic::x86_mmx_psrli_w: 9484 NewIntNo = Intrinsic::x86_mmx_psrl_w; 9485 break; 9486 case Intrinsic::x86_mmx_psrli_d: 9487 NewIntNo = Intrinsic::x86_mmx_psrl_d; 9488 break; 9489 case Intrinsic::x86_mmx_psrli_q: 9490 NewIntNo = Intrinsic::x86_mmx_psrl_q; 9491 break; 9492 case Intrinsic::x86_mmx_psrai_w: 9493 NewIntNo = Intrinsic::x86_mmx_psra_w; 9494 break; 9495 case Intrinsic::x86_mmx_psrai_d: 9496 NewIntNo = Intrinsic::x86_mmx_psra_d; 9497 break; 9498 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
9499 } 9500 break; 9501 } 9502 } 9503 9504 // The vector shift intrinsics with scalars uses 32b shift amounts but 9505 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 9506 // to be zero. 9507 SDValue ShOps[4]; 9508 ShOps[0] = ShAmt; 9509 ShOps[1] = DAG.getConstant(0, MVT::i32); 9510 if (ShAmtVT == MVT::v4i32) { 9511 ShOps[2] = DAG.getUNDEF(MVT::i32); 9512 ShOps[3] = DAG.getUNDEF(MVT::i32); 9513 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 9514 } else { 9515 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 9516// FIXME this must be lowered to get rid of the invalid type. 9517 } 9518 9519 EVT VT = Op.getValueType(); 9520 ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); 9521 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9522 DAG.getConstant(NewIntNo, MVT::i32), 9523 Op.getOperand(1), ShAmt); 9524 } 9525 } 9526} 9527 9528SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 9529 SelectionDAG &DAG) const { 9530 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 9531 MFI->setReturnAddressIsTaken(true); 9532 9533 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9534 DebugLoc dl = Op.getDebugLoc(); 9535 9536 if (Depth > 0) { 9537 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 9538 SDValue Offset = 9539 DAG.getConstant(TD->getPointerSize(), 9540 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 9541 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 9542 DAG.getNode(ISD::ADD, dl, getPointerTy(), 9543 FrameAddr, Offset), 9544 MachinePointerInfo(), false, false, false, 0); 9545 } 9546 9547 // Just load the return address. 9548 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 9549 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 9550 RetAddrFI, MachinePointerInfo(), false, false, false, 0); 9551} 9552 9553SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 9554 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 9555 MFI->setFrameAddressIsTaken(true); 9556 9557 EVT VT = Op.getValueType(); 9558 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 9559 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 9560 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 9561 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 9562 while (Depth--) 9563 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 9564 MachinePointerInfo(), 9565 false, false, false, 0); 9566 return FrameAddr; 9567} 9568 9569SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 9570 SelectionDAG &DAG) const { 9571 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 9572} 9573 9574SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 9575 MachineFunction &MF = DAG.getMachineFunction(); 9576 SDValue Chain = Op.getOperand(0); 9577 SDValue Offset = Op.getOperand(1); 9578 SDValue Handler = Op.getOperand(2); 9579 DebugLoc dl = Op.getDebugLoc(); 9580 9581 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 9582 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 9583 getPointerTy()); 9584 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 9585 9586 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 9587 DAG.getIntPtrConstant(TD->getPointerSize())); 9588 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 9589 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 9590 false, false, 0); 9591 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 9592 MF.getRegInfo().addLiveOut(StoreAddrReg); 9593 9594 return DAG.getNode(X86ISD::EH_RETURN, dl, 9595 MVT::Other, 9596 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 9597} 9598 9599SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, 9600 SelectionDAG &DAG) const { 9601 return Op.getOperand(0); 9602} 9603 9604SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 9605 SelectionDAG &DAG) const { 9606 SDValue Root = Op.getOperand(0); 9607 SDValue Trmp = Op.getOperand(1); // trampoline 9608 SDValue FPtr = Op.getOperand(2); // nested function 9609 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 9610 DebugLoc dl = Op.getDebugLoc(); 9611 9612 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 9613 9614 if (Subtarget->is64Bit()) { 9615 SDValue OutChains[6]; 9616 9617 // Large code-model. 9618 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 9619 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 9620 9621 const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); 9622 const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); 9623 9624 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 9625 9626 // Load the pointer to the nested function into R11. 9627 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 9628 SDValue Addr = Trmp; 9629 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9630 Addr, MachinePointerInfo(TrmpAddr), 9631 false, false, 0); 9632 9633 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9634 DAG.getConstant(2, MVT::i64)); 9635 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 9636 MachinePointerInfo(TrmpAddr, 2), 9637 false, false, 2); 9638 9639 // Load the 'nest' parameter value into R10. 9640 // R10 is specified in X86CallingConv.td 9641 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 9642 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9643 DAG.getConstant(10, MVT::i64)); 9644 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9645 Addr, MachinePointerInfo(TrmpAddr, 10), 9646 false, false, 0); 9647 9648 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9649 DAG.getConstant(12, MVT::i64)); 9650 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 9651 MachinePointerInfo(TrmpAddr, 12), 9652 false, false, 2); 9653 9654 // Jump to the nested function. 9655 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
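// (For reference, the six stores in this block write, roughly:
//    49 BB <fptr:imm64>   movabsq $fptr, %r11     bytes 0..9
//    49 BA <nest:imm64>   movabsq $nest, %r10     bytes 10..19
//    49 FF E3             jmpq    *%r11           bytes 20..22 )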
9656 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9657 DAG.getConstant(20, MVT::i64)); 9658 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 9659 Addr, MachinePointerInfo(TrmpAddr, 20), 9660 false, false, 0); 9661 9662 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 9663 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 9664 DAG.getConstant(22, MVT::i64)); 9665 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 9666 MachinePointerInfo(TrmpAddr, 22), 9667 false, false, 0); 9668 9669 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); 9670 } else { 9671 const Function *Func = 9672 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 9673 CallingConv::ID CC = Func->getCallingConv(); 9674 unsigned NestReg; 9675 9676 switch (CC) { 9677 default: 9678 llvm_unreachable("Unsupported calling convention"); 9679 case CallingConv::C: 9680 case CallingConv::X86_StdCall: { 9681 // Pass 'nest' parameter in ECX. 9682 // Must be kept in sync with X86CallingConv.td 9683 NestReg = X86::ECX; 9684 9685 // Check that ECX wasn't needed by an 'inreg' parameter. 9686 FunctionType *FTy = Func->getFunctionType(); 9687 const AttrListPtr &Attrs = Func->getAttributes(); 9688 9689 if (!Attrs.isEmpty() && !Func->isVarArg()) { 9690 unsigned InRegCount = 0; 9691 unsigned Idx = 1; 9692 9693 for (FunctionType::param_iterator I = FTy->param_begin(), 9694 E = FTy->param_end(); I != E; ++I, ++Idx) 9695 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 9696 // FIXME: should only count parameters that are lowered to integers. 9697 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 9698 9699 if (InRegCount > 2) { 9700 report_fatal_error("Nest register in use - reduce number of inreg" 9701 " parameters!"); 9702 } 9703 } 9704 break; 9705 } 9706 case CallingConv::X86_FastCall: 9707 case CallingConv::X86_ThisCall: 9708 case CallingConv::Fast: 9709 // Pass 'nest' parameter in EAX. 9710 // Must be kept in sync with X86CallingConv.td 9711 NestReg = X86::EAX; 9712 break; 9713 } 9714 9715 SDValue OutChains[4]; 9716 SDValue Addr, Disp; 9717 9718 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9719 DAG.getConstant(10, MVT::i32)); 9720 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 9721 9722 // This is storing the opcode for MOV32ri. 9723 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 9724 const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); 9725 OutChains[0] = DAG.getStore(Root, dl, 9726 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 9727 Trmp, MachinePointerInfo(TrmpAddr), 9728 false, false, 0); 9729 9730 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9731 DAG.getConstant(1, MVT::i32)); 9732 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 9733 MachinePointerInfo(TrmpAddr, 1), 9734 false, false, 1); 9735 9736 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
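// (For reference, the resulting 10-byte 32-bit trampoline is, roughly:
//    B8+r <nest:imm32>    movl $nest, %ecx or %eax   bytes 0..4
//    E9   <rel32>         jmp  fptr                  bytes 5..9 )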
9737 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9738 DAG.getConstant(5, MVT::i32)); 9739 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 9740 MachinePointerInfo(TrmpAddr, 5), 9741 false, false, 1); 9742 9743 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 9744 DAG.getConstant(6, MVT::i32)); 9745 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 9746 MachinePointerInfo(TrmpAddr, 6), 9747 false, false, 1); 9748 9749 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); 9750 } 9751} 9752 9753SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 9754 SelectionDAG &DAG) const { 9755 /* 9756 The rounding mode is in bits 11:10 of FPSR, and has the following 9757 settings: 9758 00 Round to nearest 9759 01 Round to -inf 9760 10 Round to +inf 9761 11 Round to 0 9762 9763 FLT_ROUNDS, on the other hand, expects the following: 9764 -1 Undefined 9765 0 Round to 0 9766 1 Round to nearest 9767 2 Round to +inf 9768 3 Round to -inf 9769 9770 To perform the conversion, we do: 9771 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 9772 */ 9773 9774 MachineFunction &MF = DAG.getMachineFunction(); 9775 const TargetMachine &TM = MF.getTarget(); 9776 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 9777 unsigned StackAlignment = TFI.getStackAlignment(); 9778 EVT VT = Op.getValueType(); 9779 DebugLoc DL = Op.getDebugLoc(); 9780 9781 // Save FP Control Word to stack slot 9782 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 9783 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 9784 9785 9786 MachineMemOperand *MMO = 9787 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 9788 MachineMemOperand::MOStore, 2, 2); 9789 9790 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 9791 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 9792 DAG.getVTList(MVT::Other), 9793 Ops, 2, MVT::i16, MMO); 9794 9795 // Load FP Control Word from stack slot 9796 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 9797 MachinePointerInfo(), false, false, false, 0); 9798 9799 // Transform as necessary 9800 SDValue CWD1 = 9801 DAG.getNode(ISD::SRL, DL, MVT::i16, 9802 DAG.getNode(ISD::AND, DL, MVT::i16, 9803 CWD, DAG.getConstant(0x800, MVT::i16)), 9804 DAG.getConstant(11, MVT::i8)); 9805 SDValue CWD2 = 9806 DAG.getNode(ISD::SRL, DL, MVT::i16, 9807 DAG.getNode(ISD::AND, DL, MVT::i16, 9808 CWD, DAG.getConstant(0x400, MVT::i16)), 9809 DAG.getConstant(9, MVT::i8)); 9810 9811 SDValue RetVal = 9812 DAG.getNode(ISD::AND, DL, MVT::i16, 9813 DAG.getNode(ISD::ADD, DL, MVT::i16, 9814 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 9815 DAG.getConstant(1, MVT::i16)), 9816 DAG.getConstant(3, MVT::i16)); 9817 9818 9819 return DAG.getNode((VT.getSizeInBits() < 16 ? 9820 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 9821} 9822 9823SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 9824 EVT VT = Op.getValueType(); 9825 EVT OpVT = VT; 9826 unsigned NumBits = VT.getSizeInBits(); 9827 DebugLoc dl = Op.getDebugLoc(); 9828 9829 Op = Op.getOperand(0); 9830 if (VT == MVT::i8) { 9831 // Zero extend to i32 since there is not an i8 bsr. 9832 OpVT = MVT::i32; 9833 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 9834 } 9835 9836 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 9837 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 9838 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 9839 9840 // If src is zero (i.e. bsr sets ZF), returns NumBits. 
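// For example (i32): bsr(0x000000F0) = 7 and 7 ^ 31 = 24 = ctlz(0x000000F0).
// The CMOV below substitutes 2*NumBits-1 (63 for i32) when the input is zero,
// so the final xor with NumBits-1 still yields NumBits.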
9841 SDValue Ops[] = { 9842 Op, 9843 DAG.getConstant(NumBits+NumBits-1, OpVT), 9844 DAG.getConstant(X86::COND_E, MVT::i8), 9845 Op.getValue(1) 9846 }; 9847 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 9848 9849 // Finally xor with NumBits-1. 9850 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 9851 9852 if (VT == MVT::i8) 9853 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 9854 return Op; 9855} 9856 9857SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, 9858 SelectionDAG &DAG) const { 9859 EVT VT = Op.getValueType(); 9860 EVT OpVT = VT; 9861 unsigned NumBits = VT.getSizeInBits(); 9862 DebugLoc dl = Op.getDebugLoc(); 9863 9864 Op = Op.getOperand(0); 9865 if (VT == MVT::i8) { 9866 // Zero extend to i32 since there is not an i8 bsr. 9867 OpVT = MVT::i32; 9868 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 9869 } 9870 9871 // Issue a bsr (scan bits in reverse). 9872 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 9873 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 9874 9875 // And xor with NumBits-1. 9876 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 9877 9878 if (VT == MVT::i8) 9879 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 9880 return Op; 9881} 9882 9883SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 9884 EVT VT = Op.getValueType(); 9885 unsigned NumBits = VT.getSizeInBits(); 9886 DebugLoc dl = Op.getDebugLoc(); 9887 Op = Op.getOperand(0); 9888 9889 // Issue a bsf (scan bits forward) which also sets EFLAGS. 9890 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 9891 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 9892 9893 // If src is zero (i.e. bsf sets ZF), returns NumBits. 9894 SDValue Ops[] = { 9895 Op, 9896 DAG.getConstant(NumBits, VT), 9897 DAG.getConstant(X86::COND_E, MVT::i8), 9898 Op.getValue(1) 9899 }; 9900 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); 9901} 9902 9903// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit 9904// ones, and then concatenate the result back. 
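// For example (a sketch), a v8i32 add handled this way becomes roughly:
//   LHS1 = extract_subvector(LHS, 0);  RHS1 = extract_subvector(RHS, 0);
//   LHS2 = extract_subvector(LHS, 4);  RHS2 = extract_subvector(RHS, 4);
//   result = concat_vectors(add(LHS1, RHS1), add(LHS2, RHS2)); // two v4i32 adds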
9905static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 9906 EVT VT = Op.getValueType(); 9907 9908 assert(VT.getSizeInBits() == 256 && VT.isInteger() && 9909 "Unsupported value type for operation"); 9910 9911 int NumElems = VT.getVectorNumElements(); 9912 DebugLoc dl = Op.getDebugLoc(); 9913 SDValue Idx0 = DAG.getConstant(0, MVT::i32); 9914 SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); 9915 9916 // Extract the LHS vectors 9917 SDValue LHS = Op.getOperand(0); 9918 SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); 9919 SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); 9920 9921 // Extract the RHS vectors 9922 SDValue RHS = Op.getOperand(1); 9923 SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); 9924 SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); 9925 9926 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 9927 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 9928 9929 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 9930 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 9931 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 9932} 9933 9934SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const { 9935 assert(Op.getValueType().getSizeInBits() == 256 && 9936 Op.getValueType().isInteger() && 9937 "Only handle AVX 256-bit vector integer operation"); 9938 return Lower256IntArith(Op, DAG); 9939} 9940 9941SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const { 9942 assert(Op.getValueType().getSizeInBits() == 256 && 9943 Op.getValueType().isInteger() && 9944 "Only handle AVX 256-bit vector integer operation"); 9945 return Lower256IntArith(Op, DAG); 9946} 9947 9948SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { 9949 EVT VT = Op.getValueType(); 9950 9951 // Decompose 256-bit ops into smaller 128-bit ops. 
9952 if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) 9953 return Lower256IntArith(Op, DAG); 9954 9955 DebugLoc dl = Op.getDebugLoc(); 9956 9957 SDValue A = Op.getOperand(0); 9958 SDValue B = Op.getOperand(1); 9959 9960 if (VT == MVT::v4i64) { 9961 assert(Subtarget->hasAVX2() && "Lowering v4i64 multiply requires AVX2"); 9962 9963 // ulong2 Ahi = __builtin_ia32_psrlqi256( a, 32); 9964 // ulong2 Bhi = __builtin_ia32_psrlqi256( b, 32); 9965 // ulong2 AloBlo = __builtin_ia32_pmuludq256( a, b ); 9966 // ulong2 AloBhi = __builtin_ia32_pmuludq256( a, Bhi ); 9967 // ulong2 AhiBlo = __builtin_ia32_pmuludq256( Ahi, b ); 9968 // 9969 // AloBhi = __builtin_ia32_psllqi256( AloBhi, 32 ); 9970 // AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 ); 9971 // return AloBlo + AloBhi + AhiBlo; 9972 9973 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9974 DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32), 9975 A, DAG.getConstant(32, MVT::i32)); 9976 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9977 DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32), 9978 B, DAG.getConstant(32, MVT::i32)); 9979 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9980 DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), 9981 A, B); 9982 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9983 DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), 9984 A, Bhi); 9985 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9986 DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32), 9987 Ahi, B); 9988 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9989 DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32), 9990 AloBhi, DAG.getConstant(32, MVT::i32)); 9991 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9992 DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32), 9993 AhiBlo, DAG.getConstant(32, MVT::i32)); 9994 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 9995 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 9996 return Res; 9997 } 9998 9999 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 10000 10001 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 10002 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 10003 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 10004 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 10005 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 10006 // 10007 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 10008 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 10009 // return AloBlo + AloBhi + AhiBlo; 10010 10011 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10012 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 10013 A, DAG.getConstant(32, MVT::i32)); 10014 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10015 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 10016 B, DAG.getConstant(32, MVT::i32)); 10017 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10018 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 10019 A, B); 10020 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10021 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 10022 A, Bhi); 10023 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10024 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 10025 Ahi, B); 10026 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10027 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 10028 AloBhi, DAG.getConstant(32, MVT::i32)); 10029 AhiBlo = 
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10030 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 10031 AhiBlo, DAG.getConstant(32, MVT::i32)); 10032 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 10033 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 10034 return Res; 10035} 10036 10037SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 10038 10039 EVT VT = Op.getValueType(); 10040 DebugLoc dl = Op.getDebugLoc(); 10041 SDValue R = Op.getOperand(0); 10042 SDValue Amt = Op.getOperand(1); 10043 LLVMContext *Context = DAG.getContext(); 10044 10045 if (!Subtarget->hasXMMInt()) 10046 return SDValue(); 10047 10048 // Optimize shl/srl/sra with constant shift amount. 10049 if (isSplatVector(Amt.getNode())) { 10050 SDValue SclrAmt = Amt->getOperand(0); 10051 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 10052 uint64_t ShiftAmt = C->getZExtValue(); 10053 10054 if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SHL) { 10055 // Make a large shift. 10056 SDValue SHL = 10057 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10058 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 10059 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10060 // Zero out the rightmost bits. 10061 SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U << ShiftAmt), 10062 MVT::i8)); 10063 return DAG.getNode(ISD::AND, dl, VT, SHL, 10064 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 10065 } 10066 10067 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) 10068 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10069 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 10070 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10071 10072 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) 10073 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10074 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 10075 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10076 10077 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) 10078 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10079 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 10080 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10081 10082 if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRL) { 10083 // Make a large shift. 10084 SDValue SRL = 10085 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10086 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 10087 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10088 // Zero out the leftmost bits. 
10089 SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 10090 MVT::i8)); 10091 return DAG.getNode(ISD::AND, dl, VT, SRL, 10092 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); 10093 } 10094 10095 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) 10096 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10097 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 10098 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10099 10100 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) 10101 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10102 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 10103 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10104 10105 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) 10106 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10107 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 10108 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10109 10110 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) 10111 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10112 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 10113 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10114 10115 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) 10116 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10117 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 10118 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10119 10120 if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRA) { 10121 if (ShiftAmt == 7) { 10122 // R s>> 7 === R s< 0 10123 SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); 10124 return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R); 10125 } 10126 10127 // R s>> a === ((R u>> a) ^ m) - m 10128 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 10129 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, 10130 MVT::i8)); 10131 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); 10132 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 10133 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 10134 return Res; 10135 } 10136 10137 if (Subtarget->hasAVX2() && VT == MVT::v32i8) { 10138 if (Op.getOpcode() == ISD::SHL) { 10139 // Make a large shift. 10140 SDValue SHL = 10141 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10142 DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32), 10143 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10144 // Zero out the rightmost bits. 10145 SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U << ShiftAmt), 10146 MVT::i8)); 10147 return DAG.getNode(ISD::AND, dl, VT, SHL, 10148 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 10149 } 10150 if (Op.getOpcode() == ISD::SRL) { 10151 // Make a large shift. 10152 SDValue SRL = 10153 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10154 DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32), 10155 R, DAG.getConstant(ShiftAmt, MVT::i32)); 10156 // Zero out the leftmost bits. 
10157 SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 10158 MVT::i8)); 10159 return DAG.getNode(ISD::AND, dl, VT, SRL, 10160 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); 10161 } 10162 if (Op.getOpcode() == ISD::SRA) { 10163 if (ShiftAmt == 7) { 10164 // R s>> 7 === R s< 0 10165 SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); 10166 return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R); 10167 } 10168 10169 // R s>> a === ((R u>> a) ^ m) - m 10170 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 10171 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, 10172 MVT::i8)); 10173 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); 10174 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 10175 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 10176 return Res; 10177 } 10178 } 10179 } 10180 } 10181 10182 // Lower SHL with variable shift amount. 10183 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 10184 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10185 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 10186 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 10187 10188 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 10189 10190 std::vector<Constant*> CV(4, CI); 10191 Constant *C = ConstantVector::get(CV); 10192 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 10193 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 10194 MachinePointerInfo::getConstantPool(), 10195 false, false, false, 16); 10196 10197 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 10198 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 10199 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 10200 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 10201 } 10202 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 10203 assert((Subtarget->hasSSE2() || Subtarget->hasAVX()) && 10204 "Need SSE2 for pslli/pcmpeq."); 10205 10206 // a = a << 5; 10207 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10208 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 10209 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 10210 10211 // Turn 'a' into a mask suitable for VSELECT 10212 SDValue VSelM = DAG.getConstant(0x80, VT); 10213 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10214 OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10215 DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32), 10216 OpVSel, VSelM); 10217 10218 SDValue CM1 = DAG.getConstant(0x0f, VT); 10219 SDValue CM2 = DAG.getConstant(0x3f, VT); 10220 10221 // r = VSELECT(r, psllw(r & (char16)15, 4), a); 10222 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); 10223 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10224 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 10225 DAG.getConstant(4, MVT::i32)); 10226 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 10227 10228 // a += a 10229 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 10230 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10231 OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10232 DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32), 10233 OpVSel, VSelM); 10234 10235 // r = VSELECT(r, psllw(r & (char16)63, 2), a); 10236 M = DAG.getNode(ISD::AND, dl, VT, R, CM2); 10237 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10238 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 10239 DAG.getConstant(2, MVT::i32)); 10240 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 10241 10242 // a += a 10243 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 
10244 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 10245 OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10246 DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32), 10247 OpVSel, VSelM); 10248 10249 // return VSELECT(r, r+r, a); 10250 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, 10251 DAG.getNode(ISD::ADD, dl, VT, R, R), R); 10252 return R; 10253 } 10254 10255 // Decompose 256-bit shifts into smaller 128-bit shifts. 10256 if (VT.getSizeInBits() == 256) { 10257 int NumElems = VT.getVectorNumElements(); 10258 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 10259 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 10260 10261 // Extract the two vectors 10262 SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); 10263 SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), 10264 DAG, dl); 10265 10266 // Recreate the shift amount vectors 10267 SDValue Amt1, Amt2; 10268 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 10269 // Constant shift amount 10270 SmallVector<SDValue, 4> Amt1Csts; 10271 SmallVector<SDValue, 4> Amt2Csts; 10272 for (int i = 0; i < NumElems/2; ++i) 10273 Amt1Csts.push_back(Amt->getOperand(i)); 10274 for (int i = NumElems/2; i < NumElems; ++i) 10275 Amt2Csts.push_back(Amt->getOperand(i)); 10276 10277 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 10278 &Amt1Csts[0], NumElems/2); 10279 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, 10280 &Amt2Csts[0], NumElems/2); 10281 } else { 10282 // Variable shift amount 10283 Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl); 10284 Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32), 10285 DAG, dl); 10286 } 10287 10288 // Issue new vector shifts for the smaller types 10289 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); 10290 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); 10291 10292 // Concatenate the result back 10293 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); 10294 } 10295 10296 return SDValue(); 10297} 10298 10299SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 10300 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus 10301 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 10302 // looks for this combo and may remove the "setcc" instruction if the "setcc" 10303 // has only one use. 10304 SDNode *N = Op.getNode(); 10305 SDValue LHS = N->getOperand(0); 10306 SDValue RHS = N->getOperand(1); 10307 unsigned BaseOp = 0; 10308 unsigned Cond = 0; 10309 DebugLoc DL = Op.getDebugLoc(); 10310 switch (Op.getOpcode()) { 10311 default: llvm_unreachable("Unknown ovf instruction!"); 10312 case ISD::SADDO: 10313 // An add of one will be selected as an INC. Note that INC doesn't 10314 // set CF, so we can't do this for UADDO. 10315 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 10316 if (C->isOne()) { 10317 BaseOp = X86ISD::INC; 10318 Cond = X86::COND_O; 10319 break; 10320 } 10321 BaseOp = X86ISD::ADD; 10322 Cond = X86::COND_O; 10323 break; 10324 case ISD::UADDO: 10325 BaseOp = X86ISD::ADD; 10326 Cond = X86::COND_B; 10327 break; 10328 case ISD::SSUBO: 10329 // A subtract of one will be selected as a DEC. Note that DEC doesn't 10330 // set CF, so we can't do this for USUBO.
10331 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 10332 if (C->isOne()) { 10333 BaseOp = X86ISD::DEC; 10334 Cond = X86::COND_O; 10335 break; 10336 } 10337 BaseOp = X86ISD::SUB; 10338 Cond = X86::COND_O; 10339 break; 10340 case ISD::USUBO: 10341 BaseOp = X86ISD::SUB; 10342 Cond = X86::COND_B; 10343 break; 10344 case ISD::SMULO: 10345 BaseOp = X86ISD::SMUL; 10346 Cond = X86::COND_O; 10347 break; 10348 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 10349 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 10350 MVT::i32); 10351 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 10352 10353 SDValue SetCC = 10354 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10355 DAG.getConstant(X86::COND_O, MVT::i32), 10356 SDValue(Sum.getNode(), 2)); 10357 10358 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 10359 } 10360 } 10361 10362 // Also sets EFLAGS. 10363 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 10364 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 10365 10366 SDValue SetCC = 10367 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 10368 DAG.getConstant(Cond, MVT::i32), 10369 SDValue(Sum.getNode(), 1)); 10370 10371 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 10372} 10373 10374SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 10375 SelectionDAG &DAG) const { 10376 DebugLoc dl = Op.getDebugLoc(); 10377 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 10378 EVT VT = Op.getValueType(); 10379 10380 if (Subtarget->hasXMMInt() && VT.isVector()) { 10381 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 10382 ExtraVT.getScalarType().getSizeInBits(); 10383 SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); 10384 10385 unsigned SHLIntrinsicsID = 0; 10386 unsigned SRAIntrinsicsID = 0; 10387 switch (VT.getSimpleVT().SimpleTy) { 10388 default: 10389 return SDValue(); 10390 case MVT::v4i32: 10391 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d; 10392 SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d; 10393 break; 10394 case MVT::v8i16: 10395 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w; 10396 SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w; 10397 break; 10398 case MVT::v8i32: 10399 case MVT::v16i16: 10400 if (!Subtarget->hasAVX()) 10401 return SDValue(); 10402 if (!Subtarget->hasAVX2()) { 10403 // needs to be split 10404 int NumElems = VT.getVectorNumElements(); 10405 SDValue Idx0 = DAG.getConstant(0, MVT::i32); 10406 SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); 10407 10408 // Extract the LHS vectors 10409 SDValue LHS = Op.getOperand(0); 10410 SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); 10411 SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); 10412 10413 MVT EltVT = VT.getVectorElementType().getSimpleVT(); 10414 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 10415 10416 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 10417 int ExtraNumElems = ExtraVT.getVectorNumElements(); 10418 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, 10419 ExtraNumElems/2); 10420 SDValue Extra = DAG.getValueType(ExtraVT); 10421 10422 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); 10423 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); 10424 10425 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);; 10426 } 10427 if (VT == MVT::v8i32) { 10428 SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_d; 10429 SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_d; 10430 } else { 10431 SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_w; 10432 
SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_w; 10433 } 10434 } 10435 10436 SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10437 DAG.getConstant(SHLIntrinsicsID, MVT::i32), 10438 Op.getOperand(0), ShAmt); 10439 10440 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 10441 DAG.getConstant(SRAIntrinsicsID, MVT::i32), 10442 Tmp1, ShAmt); 10443 } 10444 10445 return SDValue(); 10446} 10447 10448 10449SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 10450 DebugLoc dl = Op.getDebugLoc(); 10451 10452 // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. 10453 // There isn't any reason to disable it if the target processor supports it. 10454 if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) { 10455 SDValue Chain = Op.getOperand(0); 10456 SDValue Zero = DAG.getConstant(0, MVT::i32); 10457 SDValue Ops[] = { 10458 DAG.getRegister(X86::ESP, MVT::i32), // Base 10459 DAG.getTargetConstant(1, MVT::i8), // Scale 10460 DAG.getRegister(0, MVT::i32), // Index 10461 DAG.getTargetConstant(0, MVT::i32), // Disp 10462 DAG.getRegister(0, MVT::i32), // Segment. 10463 Zero, 10464 Chain 10465 }; 10466 SDNode *Res = 10467 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 10468 array_lengthof(Ops)); 10469 return SDValue(Res, 0); 10470 } 10471 10472 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 10473 if (!isDev) 10474 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 10475 10476 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10477 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 10478 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 10479 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 10480 10481 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 10482 if (!Op1 && !Op2 && !Op3 && Op4) 10483 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 10484 10485 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 10486 if (Op1 && !Op2 && !Op3 && !Op4) 10487 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 10488 10489 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 10490 // (MFENCE)>; 10491 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 10492} 10493 10494SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, 10495 SelectionDAG &DAG) const { 10496 DebugLoc dl = Op.getDebugLoc(); 10497 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 10498 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 10499 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 10500 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 10501 10502 // The only fence that needs an instruction is a sequentially-consistent 10503 // cross-thread fence. 10504 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 10505 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 10506 // no-sse2). There isn't any reason to disable it if the target processor 10507 // supports it. 
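// Without SSE2 (and not on x86-64) the code below falls back to
// 'lock orl $0, (%esp)', an idempotent locked read-modify-write that also
// acts as a full memory barrier on x86.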
10508 if (Subtarget->hasXMMInt() || Subtarget->is64Bit()) 10509 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 10510 10511 SDValue Chain = Op.getOperand(0); 10512 SDValue Zero = DAG.getConstant(0, MVT::i32); 10513 SDValue Ops[] = { 10514 DAG.getRegister(X86::ESP, MVT::i32), // Base 10515 DAG.getTargetConstant(1, MVT::i8), // Scale 10516 DAG.getRegister(0, MVT::i32), // Index 10517 DAG.getTargetConstant(0, MVT::i32), // Disp 10518 DAG.getRegister(0, MVT::i32), // Segment. 10519 Zero, 10520 Chain 10521 }; 10522 SDNode *Res = 10523 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 10524 array_lengthof(Ops)); 10525 return SDValue(Res, 0); 10526 } 10527 10528 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 10529 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 10530} 10531 10532 10533SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 10534 EVT T = Op.getValueType(); 10535 DebugLoc DL = Op.getDebugLoc(); 10536 unsigned Reg = 0; 10537 unsigned size = 0; 10538 switch(T.getSimpleVT().SimpleTy) { 10539 default: 10540 assert(false && "Invalid value type!"); 10541 case MVT::i8: Reg = X86::AL; size = 1; break; 10542 case MVT::i16: Reg = X86::AX; size = 2; break; 10543 case MVT::i32: Reg = X86::EAX; size = 4; break; 10544 case MVT::i64: 10545 assert(Subtarget->is64Bit() && "Node not type legal!"); 10546 Reg = X86::RAX; size = 8; 10547 break; 10548 } 10549 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 10550 Op.getOperand(2), SDValue()); 10551 SDValue Ops[] = { cpIn.getValue(0), 10552 Op.getOperand(1), 10553 Op.getOperand(3), 10554 DAG.getTargetConstant(size, MVT::i8), 10555 cpIn.getValue(1) }; 10556 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10557 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 10558 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 10559 Ops, 5, T, MMO); 10560 SDValue cpOut = 10561 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 10562 return cpOut; 10563} 10564 10565SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 10566 SelectionDAG &DAG) const { 10567 assert(Subtarget->is64Bit() && "Result not type legalized?"); 10568 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10569 SDValue TheChain = Op.getOperand(0); 10570 DebugLoc dl = Op.getDebugLoc(); 10571 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 10572 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 10573 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 10574 rax.getValue(2)); 10575 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 10576 DAG.getConstant(32, MVT::i8)); 10577 SDValue Ops[] = { 10578 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 10579 rdx.getValue(1) 10580 }; 10581 return DAG.getMergeValues(Ops, 2, dl); 10582} 10583 10584SDValue X86TargetLowering::LowerBITCAST(SDValue Op, 10585 SelectionDAG &DAG) const { 10586 EVT SrcVT = Op.getOperand(0).getValueType(); 10587 EVT DstVT = Op.getValueType(); 10588 assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() && 10589 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 10590 assert((DstVT == MVT::i64 || 10591 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 10592 "Unexpected custom BITCAST"); 10593 // i64 <=> MMX conversions are Legal. 
10594 if (SrcVT==MVT::i64 && DstVT.isVector()) 10595 return Op; 10596 if (DstVT==MVT::i64 && SrcVT.isVector()) 10597 return Op; 10598 // MMX <=> MMX conversions are Legal. 10599 if (SrcVT.isVector() && DstVT.isVector()) 10600 return Op; 10601 // All other conversions need to be expanded. 10602 return SDValue(); 10603} 10604 10605SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 10606 SDNode *Node = Op.getNode(); 10607 DebugLoc dl = Node->getDebugLoc(); 10608 EVT T = Node->getValueType(0); 10609 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 10610 DAG.getConstant(0, T), Node->getOperand(2)); 10611 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 10612 cast<AtomicSDNode>(Node)->getMemoryVT(), 10613 Node->getOperand(0), 10614 Node->getOperand(1), negOp, 10615 cast<AtomicSDNode>(Node)->getSrcValue(), 10616 cast<AtomicSDNode>(Node)->getAlignment(), 10617 cast<AtomicSDNode>(Node)->getOrdering(), 10618 cast<AtomicSDNode>(Node)->getSynchScope()); 10619} 10620 10621static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 10622 SDNode *Node = Op.getNode(); 10623 DebugLoc dl = Node->getDebugLoc(); 10624 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 10625 10626 // Convert seq_cst store -> xchg 10627 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 10628 // FIXME: On 32-bit, store -> fist or movq would be more efficient 10629 // (The only way to get a 16-byte store is cmpxchg16b) 10630 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 10631 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 10632 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 10633 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 10634 cast<AtomicSDNode>(Node)->getMemoryVT(), 10635 Node->getOperand(0), 10636 Node->getOperand(1), Node->getOperand(2), 10637 cast<AtomicSDNode>(Node)->getMemOperand(), 10638 cast<AtomicSDNode>(Node)->getOrdering(), 10639 cast<AtomicSDNode>(Node)->getSynchScope()); 10640 return Swap.getValue(1); 10641 } 10642 // Other atomic stores have a simple pattern. 10643 return Op; 10644} 10645 10646static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 10647 EVT VT = Op.getNode()->getValueType(0); 10648 10649 // Let legalize expand this if it isn't a legal type yet. 10650 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 10651 return SDValue(); 10652 10653 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 10654 10655 unsigned Opc; 10656 bool ExtraOp = false; 10657 switch (Op.getOpcode()) { 10658 default: assert(0 && "Invalid code"); 10659 case ISD::ADDC: Opc = X86ISD::ADD; break; 10660 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 10661 case ISD::SUBC: Opc = X86ISD::SUB; break; 10662 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 10663 } 10664 10665 if (!ExtraOp) 10666 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 10667 Op.getOperand(1)); 10668 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 10669 Op.getOperand(1), Op.getOperand(2)); 10670} 10671 10672/// LowerOperation - Provide custom lowering hooks for some operations. 
10673/// 10674SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 10675 switch (Op.getOpcode()) { 10676 default: llvm_unreachable("Should not custom lower this!"); 10677 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 10678 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 10679 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG); 10680 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 10681 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 10682 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); 10683 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 10684 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 10685 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 10686 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 10687 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 10688 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 10689 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); 10690 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 10691 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 10692 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 10693 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 10694 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 10695 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 10696 case ISD::SHL_PARTS: 10697 case ISD::SRA_PARTS: 10698 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 10699 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 10700 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 10701 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 10702 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 10703 case ISD::FABS: return LowerFABS(Op, DAG); 10704 case ISD::FNEG: return LowerFNEG(Op, DAG); 10705 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 10706 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 10707 case ISD::SETCC: return LowerSETCC(Op, DAG); 10708 case ISD::SELECT: return LowerSELECT(Op, DAG); 10709 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 10710 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 10711 case ISD::VASTART: return LowerVASTART(Op, DAG); 10712 case ISD::VAARG: return LowerVAARG(Op, DAG); 10713 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 10714 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 10715 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 10716 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 10717 case ISD::FRAME_TO_ARGS_OFFSET: 10718 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 10719 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 10720 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 10721 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 10722 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 10723 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 10724 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 10725 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); 10726 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 10727 case ISD::MUL: return LowerMUL(Op, DAG); 10728 case ISD::SRA: 10729 case ISD::SRL: 10730 case ISD::SHL: return LowerShift(Op, DAG); 10731 case ISD::SADDO: 10732 case ISD::UADDO: 10733 case ISD::SSUBO: 10734 case ISD::USUBO: 10735 case ISD::SMULO: 10736 case ISD::UMULO: return LowerXALUO(Op, DAG); 10737 
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 10738 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 10739 case ISD::ADDC: 10740 case ISD::ADDE: 10741 case ISD::SUBC: 10742 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 10743 case ISD::ADD: return LowerADD(Op, DAG); 10744 case ISD::SUB: return LowerSUB(Op, DAG); 10745 } 10746} 10747 10748static void ReplaceATOMIC_LOAD(SDNode *Node, 10749 SmallVectorImpl<SDValue> &Results, 10750 SelectionDAG &DAG) { 10751 DebugLoc dl = Node->getDebugLoc(); 10752 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 10753 10754 // Convert wide load -> cmpxchg8b/cmpxchg16b 10755 // FIXME: On 32-bit, load -> fild or movq would be more efficient 10756 // (The only way to get a 16-byte load is cmpxchg16b) 10757 // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. 10758 SDValue Zero = DAG.getConstant(0, VT); 10759 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, 10760 Node->getOperand(0), 10761 Node->getOperand(1), Zero, Zero, 10762 cast<AtomicSDNode>(Node)->getMemOperand(), 10763 cast<AtomicSDNode>(Node)->getOrdering(), 10764 cast<AtomicSDNode>(Node)->getSynchScope()); 10765 Results.push_back(Swap.getValue(0)); 10766 Results.push_back(Swap.getValue(1)); 10767} 10768 10769void X86TargetLowering:: 10770ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 10771 SelectionDAG &DAG, unsigned NewOp) const { 10772 DebugLoc dl = Node->getDebugLoc(); 10773 assert (Node->getValueType(0) == MVT::i64 && 10774 "Only know how to expand i64 atomics"); 10775 10776 SDValue Chain = Node->getOperand(0); 10777 SDValue In1 = Node->getOperand(1); 10778 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10779 Node->getOperand(2), DAG.getIntPtrConstant(0)); 10780 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 10781 Node->getOperand(2), DAG.getIntPtrConstant(1)); 10782 SDValue Ops[] = { Chain, In1, In2L, In2H }; 10783 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 10784 SDValue Result = 10785 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 10786 cast<MemSDNode>(Node)->getMemOperand()); 10787 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 10788 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 10789 Results.push_back(Result.getValue(2)); 10790} 10791 10792/// ReplaceNodeResults - Replace a node with an illegal result type 10793/// with a new node built out of custom code. 10794void X86TargetLowering::ReplaceNodeResults(SDNode *N, 10795 SmallVectorImpl<SDValue>&Results, 10796 SelectionDAG &DAG) const { 10797 DebugLoc dl = N->getDebugLoc(); 10798 switch (N->getOpcode()) { 10799 default: 10800 assert(false && "Do not know how to custom type legalize this operation!"); 10801 return; 10802 case ISD::SIGN_EXTEND_INREG: 10803 case ISD::ADDC: 10804 case ISD::ADDE: 10805 case ISD::SUBC: 10806 case ISD::SUBE: 10807 // We don't want to expand or promote these. 10808 return; 10809 case ISD::FP_TO_SINT: { 10810 std::pair<SDValue,SDValue> Vals = 10811 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 10812 SDValue FIST = Vals.first, StackSlot = Vals.second; 10813 if (FIST.getNode() != 0) { 10814 EVT VT = N->getValueType(0); 10815 // Return a load from the stack slot. 
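      // Illustrative example: on a 32-bit target, i64 %r = fptosi double %x
      // reaches this point after FP_TO_INTHelper has emitted an
      // FP_TO_INT64_IN_MEM node that stores the converted value (via fistp)
      // into StackSlot; the replacement result pushed here is simply that
      // stack slot read back as an i64.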
10816 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 10817 MachinePointerInfo(), 10818 false, false, false, 0)); 10819 } 10820 return; 10821 } 10822 case ISD::READCYCLECOUNTER: { 10823 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10824 SDValue TheChain = N->getOperand(0); 10825 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 10826 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 10827 rd.getValue(1)); 10828 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 10829 eax.getValue(2)); 10830 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 10831 SDValue Ops[] = { eax, edx }; 10832 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 10833 Results.push_back(edx.getValue(1)); 10834 return; 10835 } 10836 case ISD::ATOMIC_CMP_SWAP: { 10837 EVT T = N->getValueType(0); 10838 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); 10839 bool Regs64bit = T == MVT::i128; 10840 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; 10841 SDValue cpInL, cpInH; 10842 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 10843 DAG.getConstant(0, HalfT)); 10844 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 10845 DAG.getConstant(1, HalfT)); 10846 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, 10847 Regs64bit ? X86::RAX : X86::EAX, 10848 cpInL, SDValue()); 10849 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, 10850 Regs64bit ? X86::RDX : X86::EDX, 10851 cpInH, cpInL.getValue(1)); 10852 SDValue swapInL, swapInH; 10853 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 10854 DAG.getConstant(0, HalfT)); 10855 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 10856 DAG.getConstant(1, HalfT)); 10857 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, 10858 Regs64bit ? X86::RBX : X86::EBX, 10859 swapInL, cpInH.getValue(1)); 10860 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, 10861 Regs64bit ? X86::RCX : X86::ECX, 10862 swapInH, swapInL.getValue(1)); 10863 SDValue Ops[] = { swapInH.getValue(0), 10864 N->getOperand(1), 10865 swapInH.getValue(1) }; 10866 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 10867 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 10868 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : 10869 X86ISD::LCMPXCHG8_DAG; 10870 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, 10871 Ops, 3, T, MMO); 10872 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, 10873 Regs64bit ? X86::RAX : X86::EAX, 10874 HalfT, Result.getValue(1)); 10875 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, 10876 Regs64bit ? 
X86::RDX : X86::EDX, 10877 HalfT, cpOutL.getValue(2)); 10878 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 10879 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); 10880 Results.push_back(cpOutH.getValue(1)); 10881 return; 10882 } 10883 case ISD::ATOMIC_LOAD_ADD: 10884 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 10885 return; 10886 case ISD::ATOMIC_LOAD_AND: 10887 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 10888 return; 10889 case ISD::ATOMIC_LOAD_NAND: 10890 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 10891 return; 10892 case ISD::ATOMIC_LOAD_OR: 10893 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 10894 return; 10895 case ISD::ATOMIC_LOAD_SUB: 10896 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 10897 return; 10898 case ISD::ATOMIC_LOAD_XOR: 10899 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 10900 return; 10901 case ISD::ATOMIC_SWAP: 10902 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 10903 return; 10904 case ISD::ATOMIC_LOAD: 10905 ReplaceATOMIC_LOAD(N, Results, DAG); 10906 } 10907} 10908 10909const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 10910 switch (Opcode) { 10911 default: return NULL; 10912 case X86ISD::BSF: return "X86ISD::BSF"; 10913 case X86ISD::BSR: return "X86ISD::BSR"; 10914 case X86ISD::SHLD: return "X86ISD::SHLD"; 10915 case X86ISD::SHRD: return "X86ISD::SHRD"; 10916 case X86ISD::FAND: return "X86ISD::FAND"; 10917 case X86ISD::FOR: return "X86ISD::FOR"; 10918 case X86ISD::FXOR: return "X86ISD::FXOR"; 10919 case X86ISD::FSRL: return "X86ISD::FSRL"; 10920 case X86ISD::FILD: return "X86ISD::FILD"; 10921 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 10922 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 10923 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 10924 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 10925 case X86ISD::FLD: return "X86ISD::FLD"; 10926 case X86ISD::FST: return "X86ISD::FST"; 10927 case X86ISD::CALL: return "X86ISD::CALL"; 10928 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 10929 case X86ISD::BT: return "X86ISD::BT"; 10930 case X86ISD::CMP: return "X86ISD::CMP"; 10931 case X86ISD::COMI: return "X86ISD::COMI"; 10932 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 10933 case X86ISD::SETCC: return "X86ISD::SETCC"; 10934 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 10935 case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; 10936 case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; 10937 case X86ISD::CMOV: return "X86ISD::CMOV"; 10938 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 10939 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 10940 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 10941 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 10942 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 10943 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 10944 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 10945 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 10946 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 10947 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 10948 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 10949 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 10950 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 10951 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 10952 case X86ISD::PSIGN: return "X86ISD::PSIGN"; 10953 case X86ISD::BLENDV: return "X86ISD::BLENDV"; 10954 case 
X86ISD::HADD: return "X86ISD::HADD"; 10955 case X86ISD::HSUB: return "X86ISD::HSUB"; 10956 case X86ISD::FHADD: return "X86ISD::FHADD"; 10957 case X86ISD::FHSUB: return "X86ISD::FHSUB"; 10958 case X86ISD::FMAX: return "X86ISD::FMAX"; 10959 case X86ISD::FMIN: return "X86ISD::FMIN"; 10960 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 10961 case X86ISD::FRCP: return "X86ISD::FRCP"; 10962 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 10963 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 10964 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 10965 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 10966 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 10967 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 10968 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 10969 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 10970 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 10971 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 10972 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 10973 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 10974 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 10975 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 10976 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 10977 case X86ISD::VSHL: return "X86ISD::VSHL"; 10978 case X86ISD::VSRL: return "X86ISD::VSRL"; 10979 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 10980 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 10981 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 10982 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 10983 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 10984 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 10985 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 10986 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 10987 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 10988 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 10989 case X86ISD::ADD: return "X86ISD::ADD"; 10990 case X86ISD::SUB: return "X86ISD::SUB"; 10991 case X86ISD::ADC: return "X86ISD::ADC"; 10992 case X86ISD::SBB: return "X86ISD::SBB"; 10993 case X86ISD::SMUL: return "X86ISD::SMUL"; 10994 case X86ISD::UMUL: return "X86ISD::UMUL"; 10995 case X86ISD::INC: return "X86ISD::INC"; 10996 case X86ISD::DEC: return "X86ISD::DEC"; 10997 case X86ISD::OR: return "X86ISD::OR"; 10998 case X86ISD::XOR: return "X86ISD::XOR"; 10999 case X86ISD::AND: return "X86ISD::AND"; 11000 case X86ISD::ANDN: return "X86ISD::ANDN"; 11001 case X86ISD::BLSI: return "X86ISD::BLSI"; 11002 case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; 11003 case X86ISD::BLSR: return "X86ISD::BLSR"; 11004 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 11005 case X86ISD::PTEST: return "X86ISD::PTEST"; 11006 case X86ISD::TESTP: return "X86ISD::TESTP"; 11007 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 11008 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 11009 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 11010 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 11011 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 11012 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 11013 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 11014 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 11015 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 11016 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 11017 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 11018 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 11019 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 11020 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 
11021 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 11022 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 11023 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 11024 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 11025 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 11026 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 11027 case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; 11028 case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; 11029 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 11030 case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; 11031 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; 11032 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 11033 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 11034 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 11035 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 11036 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; 11037 } 11038} 11039 11040// isLegalAddressingMode - Return true if the addressing mode represented 11041// by AM is legal for this target, for a load/store of the specified type. 11042bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 11043 Type *Ty) const { 11044 // X86 supports extremely general addressing modes. 11045 CodeModel::Model M = getTargetMachine().getCodeModel(); 11046 Reloc::Model R = getTargetMachine().getRelocationModel(); 11047 11048 // X86 allows a sign-extended 32-bit immediate field as a displacement. 11049 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 11050 return false; 11051 11052 if (AM.BaseGV) { 11053 unsigned GVFlags = 11054 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 11055 11056 // If a reference to this global requires an extra load, we can't fold it. 11057 if (isGlobalStubReference(GVFlags)) 11058 return false; 11059 11060 // If BaseGV requires a register for the PIC base, we cannot also have a 11061 // BaseReg specified. 11062 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 11063 return false; 11064 11065 // If lower 4G is not available, then we must use rip-relative addressing. 11066 if ((M != CodeModel::Small || R != Reloc::Static) && 11067 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 11068 return false; 11069 } 11070 11071 switch (AM.Scale) { 11072 case 0: 11073 case 1: 11074 case 2: 11075 case 4: 11076 case 8: 11077 // These scales always work. 11078 break; 11079 case 3: 11080 case 5: 11081 case 9: 11082 // These scales are formed with basereg+scalereg. Only accept if there is 11083 // no basereg yet. 11084 if (AM.HasBaseReg) 11085 return false; 11086 break; 11087 default: // Other stuff never works. 
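    // For example, a scale of 6 has no SIB encoding and, unlike 3/5/9, cannot
    // be folded into a base+scaled-index LEA either, so it is rejected.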
11088 return false; 11089 } 11090 11091 return true; 11092} 11093 11094 11095bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 11096 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 11097 return false; 11098 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 11099 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 11100 if (NumBits1 <= NumBits2) 11101 return false; 11102 return true; 11103} 11104 11105bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 11106 if (!VT1.isInteger() || !VT2.isInteger()) 11107 return false; 11108 unsigned NumBits1 = VT1.getSizeInBits(); 11109 unsigned NumBits2 = VT2.getSizeInBits(); 11110 if (NumBits1 <= NumBits2) 11111 return false; 11112 return true; 11113} 11114 11115bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 11116 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 11117 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 11118} 11119 11120bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 11121 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 11122 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 11123} 11124 11125bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 11126 // i16 instructions are longer (0x66 prefix) and potentially slower. 11127 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 11128} 11129 11130/// isShuffleMaskLegal - Targets can use this to indicate that they only 11131/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 11132/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 11133/// are assumed to be legal. 11134bool 11135X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 11136 EVT VT) const { 11137 // Very little shuffling can be done for 64-bit vectors right now. 11138 if (VT.getSizeInBits() == 64) 11139 return false; 11140 11141 // FIXME: pshufb, blends, shifts. 11142 return (VT.getVectorNumElements() == 2 || 11143 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 11144 isMOVLMask(M, VT) || 11145 isSHUFPMask(M, VT) || 11146 isPSHUFDMask(M, VT) || 11147 isPSHUFHWMask(M, VT) || 11148 isPSHUFLWMask(M, VT) || 11149 isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()) || 11150 isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || 11151 isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || 11152 isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) || 11153 isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2())); 11154} 11155 11156bool 11157X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 11158 EVT VT) const { 11159 unsigned NumElts = VT.getVectorNumElements(); 11160 // FIXME: This collection of masks seems suspect. 
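  // Illustration of a "clear" mask this is asked about: for v4f32, shuffling
  // V1 against a zero vector with mask <4,1,2,3> zeroes lane 0 and keeps the
  // other lanes of V1, which is the movss-style pattern isMOVLMask accepts
  // below.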
11161 if (NumElts == 2) 11162 return true; 11163 if (NumElts == 4 && VT.getSizeInBits() == 128) { 11164 return (isMOVLMask(Mask, VT) || 11165 isCommutedMOVLMask(Mask, VT, true) || 11166 isSHUFPMask(Mask, VT) || 11167 isSHUFPMask(Mask, VT, /* Commuted */ true)); 11168 } 11169 return false; 11170} 11171 11172//===----------------------------------------------------------------------===// 11173// X86 Scheduler Hooks 11174//===----------------------------------------------------------------------===// 11175 11176// private utility function 11177MachineBasicBlock * 11178X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 11179 MachineBasicBlock *MBB, 11180 unsigned regOpc, 11181 unsigned immOpc, 11182 unsigned LoadOpc, 11183 unsigned CXchgOpc, 11184 unsigned notOpc, 11185 unsigned EAXreg, 11186 TargetRegisterClass *RC, 11187 bool invSrc) const { 11188 // For the atomic bitwise operator, we generate 11189 // thisMBB: 11190 // newMBB: 11191 // ld t1 = [bitinstr.addr] 11192 // op t2 = t1, [bitinstr.val] 11193 // mov EAX = t1 11194 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 11195 // bz newMBB 11196 // fallthrough -->nextMBB 11197 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11198 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11199 MachineFunction::iterator MBBIter = MBB; 11200 ++MBBIter; 11201 11202 /// First build the CFG 11203 MachineFunction *F = MBB->getParent(); 11204 MachineBasicBlock *thisMBB = MBB; 11205 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 11206 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 11207 F->insert(MBBIter, newMBB); 11208 F->insert(MBBIter, nextMBB); 11209 11210 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 11211 nextMBB->splice(nextMBB->begin(), thisMBB, 11212 llvm::next(MachineBasicBlock::iterator(bInstr)), 11213 thisMBB->end()); 11214 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11215 11216 // Update thisMBB to fall through to newMBB 11217 thisMBB->addSuccessor(newMBB); 11218 11219 // newMBB jumps to itself and fall through to nextMBB 11220 newMBB->addSuccessor(nextMBB); 11221 newMBB->addSuccessor(newMBB); 11222 11223 // Insert instructions into newMBB based on incoming instruction 11224 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 11225 "unexpected number of operands"); 11226 DebugLoc dl = bInstr->getDebugLoc(); 11227 MachineOperand& destOper = bInstr->getOperand(0); 11228 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 11229 int numArgs = bInstr->getNumOperands() - 1; 11230 for (int i=0; i < numArgs; ++i) 11231 argOpers[i] = &bInstr->getOperand(i+1); 11232 11233 // x86 address has 4 operands: base, index, scale, and displacement 11234 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 11235 int valArgIndx = lastAddrIndx + 1; 11236 11237 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 11238 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 11239 for (int i=0; i <= lastAddrIndx; ++i) 11240 (*MIB).addOperand(*argOpers[i]); 11241 11242 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 11243 if (invSrc) { 11244 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 11245 } 11246 else 11247 tt = t1; 11248 11249 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 11250 assert((argOpers[valArgIndx]->isReg() || 11251 argOpers[valArgIndx]->isImm()) && 11252 "invalid operand"); 11253 if (argOpers[valArgIndx]->isReg()) 11254 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 11255 else 
11256 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 11257 MIB.addReg(tt); 11258 (*MIB).addOperand(*argOpers[valArgIndx]); 11259 11260 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 11261 MIB.addReg(t1); 11262 11263 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 11264 for (int i=0; i <= lastAddrIndx; ++i) 11265 (*MIB).addOperand(*argOpers[i]); 11266 MIB.addReg(t2); 11267 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 11268 (*MIB).setMemRefs(bInstr->memoperands_begin(), 11269 bInstr->memoperands_end()); 11270 11271 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 11272 MIB.addReg(EAXreg); 11273 11274 // insert branch 11275 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 11276 11277 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 11278 return nextMBB; 11279} 11280 11281// private utility function: 64 bit atomics on 32 bit host. 11282MachineBasicBlock * 11283X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 11284 MachineBasicBlock *MBB, 11285 unsigned regOpcL, 11286 unsigned regOpcH, 11287 unsigned immOpcL, 11288 unsigned immOpcH, 11289 bool invSrc) const { 11290 // For the atomic bitwise operator, we generate 11291 // thisMBB (instructions are in pairs, except cmpxchg8b) 11292 // ld t1,t2 = [bitinstr.addr] 11293 // newMBB: 11294 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 11295 // op t5, t6 <- out1, out2, [bitinstr.val] 11296 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 11297 // mov ECX, EBX <- t5, t6 11298 // mov EAX, EDX <- t1, t2 11299 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 11300 // mov t3, t4 <- EAX, EDX 11301 // bz newMBB 11302 // result in out1, out2 11303 // fallthrough -->nextMBB 11304 11305 const TargetRegisterClass *RC = X86::GR32RegisterClass; 11306 const unsigned LoadOpc = X86::MOV32rm; 11307 const unsigned NotOpc = X86::NOT32r; 11308 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11309 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11310 MachineFunction::iterator MBBIter = MBB; 11311 ++MBBIter; 11312 11313 /// First build the CFG 11314 MachineFunction *F = MBB->getParent(); 11315 MachineBasicBlock *thisMBB = MBB; 11316 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 11317 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 11318 F->insert(MBBIter, newMBB); 11319 F->insert(MBBIter, nextMBB); 11320 11321 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 11322 nextMBB->splice(nextMBB->begin(), thisMBB, 11323 llvm::next(MachineBasicBlock::iterator(bInstr)), 11324 thisMBB->end()); 11325 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11326 11327 // Update thisMBB to fall through to newMBB 11328 thisMBB->addSuccessor(newMBB); 11329 11330 // newMBB jumps to itself and fall through to nextMBB 11331 newMBB->addSuccessor(nextMBB); 11332 newMBB->addSuccessor(newMBB); 11333 11334 DebugLoc dl = bInstr->getDebugLoc(); 11335 // Insert instructions into newMBB based on incoming instruction 11336 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 
11337   assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
11338          "unexpected number of operands");
11339   MachineOperand& dest1Oper = bInstr->getOperand(0);
11340   MachineOperand& dest2Oper = bInstr->getOperand(1);
11341   MachineOperand* argOpers[2 + X86::AddrNumOperands];
11342   for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
11343     argOpers[i] = &bInstr->getOperand(i+2);
11344 
11345     // We use some of the operands multiple times, so conservatively just
11346     // clear any kill flags that might be present.
11347     if (argOpers[i]->isReg() && argOpers[i]->isUse())
11348       argOpers[i]->setIsKill(false);
11349   }
11350 
11351   // x86 address has 5 operands: base, index, scale, displacement, and segment.
11352   int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
11353 
11354   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
11355   MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
11356   for (int i=0; i <= lastAddrIndx; ++i)
11357     (*MIB).addOperand(*argOpers[i]);
11358   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
11359   MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
11360   // Same address, but add 4 to the displacement to reach the high 32-bit half.
11361   for (int i=0; i <= lastAddrIndx-2; ++i)
11362     (*MIB).addOperand(*argOpers[i]);
11363   MachineOperand newOp3 = *(argOpers[3]);
11364   if (newOp3.isImm())
11365     newOp3.setImm(newOp3.getImm()+4);
11366   else
11367     newOp3.setOffset(newOp3.getOffset()+4);
11368   (*MIB).addOperand(newOp3);
11369   (*MIB).addOperand(*argOpers[lastAddrIndx]);
11370 
11371   // t3/4 are defined later, at the bottom of the loop.
11372   unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
11373   unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
11374   BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
11375     .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
11376   BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
11377     .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
11378 
11379   // The subsequent operations should use the destination registers of
11380   // the PHI instructions.
11381 if (invSrc) { 11382 t1 = F->getRegInfo().createVirtualRegister(RC); 11383 t2 = F->getRegInfo().createVirtualRegister(RC); 11384 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 11385 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 11386 } else { 11387 t1 = dest1Oper.getReg(); 11388 t2 = dest2Oper.getReg(); 11389 } 11390 11391 int valArgIndx = lastAddrIndx + 1; 11392 assert((argOpers[valArgIndx]->isReg() || 11393 argOpers[valArgIndx]->isImm()) && 11394 "invalid operand"); 11395 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 11396 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 11397 if (argOpers[valArgIndx]->isReg()) 11398 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 11399 else 11400 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 11401 if (regOpcL != X86::MOV32rr) 11402 MIB.addReg(t1); 11403 (*MIB).addOperand(*argOpers[valArgIndx]); 11404 assert(argOpers[valArgIndx + 1]->isReg() == 11405 argOpers[valArgIndx]->isReg()); 11406 assert(argOpers[valArgIndx + 1]->isImm() == 11407 argOpers[valArgIndx]->isImm()); 11408 if (argOpers[valArgIndx + 1]->isReg()) 11409 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 11410 else 11411 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 11412 if (regOpcH != X86::MOV32rr) 11413 MIB.addReg(t2); 11414 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 11415 11416 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 11417 MIB.addReg(t1); 11418 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 11419 MIB.addReg(t2); 11420 11421 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 11422 MIB.addReg(t5); 11423 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 11424 MIB.addReg(t6); 11425 11426 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 11427 for (int i=0; i <= lastAddrIndx; ++i) 11428 (*MIB).addOperand(*argOpers[i]); 11429 11430 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 11431 (*MIB).setMemRefs(bInstr->memoperands_begin(), 11432 bInstr->memoperands_end()); 11433 11434 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 11435 MIB.addReg(X86::EAX); 11436 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 11437 MIB.addReg(X86::EDX); 11438 11439 // insert branch 11440 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 11441 11442 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 
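  // Rough shape of the loop that was just emitted (illustrative only, with
  // virtual registers shown as if already allocated):
  //
  //   movl   (addr), %eax          # initial value, loaded in thisMBB
  //   movl   4(addr), %edx
  // retry:                          # newMBB
  //   <combine %edx:%eax with the operand into %ecx:%ebx>
  //   lock cmpxchg8b (addr)         # succeeds iff memory still equals %edx:%eax
  //   jne    retry                  # it changed; %edx:%eax now hold the fresh
  //                                 # value, so simply try again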
11443 return nextMBB; 11444} 11445 11446// private utility function 11447MachineBasicBlock * 11448X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 11449 MachineBasicBlock *MBB, 11450 unsigned cmovOpc) const { 11451 // For the atomic min/max operator, we generate 11452 // thisMBB: 11453 // newMBB: 11454 // ld t1 = [min/max.addr] 11455 // mov t2 = [min/max.val] 11456 // cmp t1, t2 11457 // cmov[cond] t2 = t1 11458 // mov EAX = t1 11459 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 11460 // bz newMBB 11461 // fallthrough -->nextMBB 11462 // 11463 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11464 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11465 MachineFunction::iterator MBBIter = MBB; 11466 ++MBBIter; 11467 11468 /// First build the CFG 11469 MachineFunction *F = MBB->getParent(); 11470 MachineBasicBlock *thisMBB = MBB; 11471 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 11472 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 11473 F->insert(MBBIter, newMBB); 11474 F->insert(MBBIter, nextMBB); 11475 11476 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 11477 nextMBB->splice(nextMBB->begin(), thisMBB, 11478 llvm::next(MachineBasicBlock::iterator(mInstr)), 11479 thisMBB->end()); 11480 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11481 11482 // Update thisMBB to fall through to newMBB 11483 thisMBB->addSuccessor(newMBB); 11484 11485 // newMBB jumps to newMBB and fall through to nextMBB 11486 newMBB->addSuccessor(nextMBB); 11487 newMBB->addSuccessor(newMBB); 11488 11489 DebugLoc dl = mInstr->getDebugLoc(); 11490 // Insert instructions into newMBB based on incoming instruction 11491 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 11492 "unexpected number of operands"); 11493 MachineOperand& destOper = mInstr->getOperand(0); 11494 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 11495 int numArgs = mInstr->getNumOperands() - 1; 11496 for (int i=0; i < numArgs; ++i) 11497 argOpers[i] = &mInstr->getOperand(i+1); 11498 11499 // x86 address has 4 operands: base, index, scale, and displacement 11500 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 11501 int valArgIndx = lastAddrIndx + 1; 11502 11503 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 11504 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 11505 for (int i=0; i <= lastAddrIndx; ++i) 11506 (*MIB).addOperand(*argOpers[i]); 11507 11508 // We only support register and immediate values 11509 assert((argOpers[valArgIndx]->isReg() || 11510 argOpers[valArgIndx]->isImm()) && 11511 "invalid operand"); 11512 11513 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 11514 if (argOpers[valArgIndx]->isReg()) 11515 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 11516 else 11517 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 11518 (*MIB).addOperand(*argOpers[valArgIndx]); 11519 11520 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 11521 MIB.addReg(t1); 11522 11523 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 11524 MIB.addReg(t1); 11525 MIB.addReg(t2); 11526 11527 // Generate movc 11528 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 11529 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 11530 MIB.addReg(t2); 11531 MIB.addReg(t1); 11532 11533 // Cmp and exchange if none has modified the memory location 11534 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 
11535 for (int i=0; i <= lastAddrIndx; ++i) 11536 (*MIB).addOperand(*argOpers[i]); 11537 MIB.addReg(t3); 11538 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 11539 (*MIB).setMemRefs(mInstr->memoperands_begin(), 11540 mInstr->memoperands_end()); 11541 11542 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 11543 MIB.addReg(X86::EAX); 11544 11545 // insert branch 11546 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 11547 11548 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 11549 return nextMBB; 11550} 11551 11552// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 11553// or XMM0_V32I8 in AVX all of this code can be replaced with that 11554// in the .td file. 11555MachineBasicBlock * 11556X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 11557 unsigned numArgs, bool memArg) const { 11558 assert(Subtarget->hasSSE42orAVX() && 11559 "Target must have SSE4.2 or AVX features enabled"); 11560 11561 DebugLoc dl = MI->getDebugLoc(); 11562 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11563 unsigned Opc; 11564 if (!Subtarget->hasAVX()) { 11565 if (memArg) 11566 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 11567 else 11568 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 11569 } else { 11570 if (memArg) 11571 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 11572 else 11573 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 11574 } 11575 11576 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 11577 for (unsigned i = 0; i < numArgs; ++i) { 11578 MachineOperand &Op = MI->getOperand(i+1); 11579 if (!(Op.isReg() && Op.isImplicit())) 11580 MIB.addOperand(Op); 11581 } 11582 BuildMI(*BB, MI, dl, 11583 TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr), 11584 MI->getOperand(0).getReg()) 11585 .addReg(X86::XMM0); 11586 11587 MI->eraseFromParent(); 11588 return BB; 11589} 11590 11591MachineBasicBlock * 11592X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { 11593 DebugLoc dl = MI->getDebugLoc(); 11594 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11595 11596 // Address into RAX/EAX, other two args into ECX, EDX. 11597 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 11598 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 11599 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 11600 for (int i = 0; i < X86::AddrNumOperands; ++i) 11601 MIB.addOperand(MI->getOperand(i)); 11602 11603 unsigned ValOps = X86::AddrNumOperands; 11604 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 11605 .addReg(MI->getOperand(ValOps).getReg()); 11606 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 11607 .addReg(MI->getOperand(ValOps+1).getReg()); 11608 11609 // The instruction doesn't actually take any operands though. 11610 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 11611 11612 MI->eraseFromParent(); // The pseudo is gone now. 11613 return BB; 11614} 11615 11616MachineBasicBlock * 11617X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { 11618 DebugLoc dl = MI->getDebugLoc(); 11619 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11620 11621 // First arg in ECX, the second in EAX. 
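  // Roughly, the pseudo expands to (with %a/%b standing for its two operands):
  //   movl %a, %ecx
  //   movl %b, %eax
  //   mwait
  // MWAITrr uses ECX/EAX implicitly, so the two copies below are all the
  // setup required.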
11622 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 11623 .addReg(MI->getOperand(0).getReg()); 11624 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) 11625 .addReg(MI->getOperand(1).getReg()); 11626 11627 // The instruction doesn't actually take any operands though. 11628 BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); 11629 11630 MI->eraseFromParent(); // The pseudo is gone now. 11631 return BB; 11632} 11633 11634MachineBasicBlock * 11635X86TargetLowering::EmitVAARG64WithCustomInserter( 11636 MachineInstr *MI, 11637 MachineBasicBlock *MBB) const { 11638 // Emit va_arg instruction on X86-64. 11639 11640 // Operands to this pseudo-instruction: 11641 // 0 ) Output : destination address (reg) 11642 // 1-5) Input : va_list address (addr, i64mem) 11643 // 6 ) ArgSize : Size (in bytes) of vararg type 11644 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 11645 // 8 ) Align : Alignment of type 11646 // 9 ) EFLAGS (implicit-def) 11647 11648 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 11649 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 11650 11651 unsigned DestReg = MI->getOperand(0).getReg(); 11652 MachineOperand &Base = MI->getOperand(1); 11653 MachineOperand &Scale = MI->getOperand(2); 11654 MachineOperand &Index = MI->getOperand(3); 11655 MachineOperand &Disp = MI->getOperand(4); 11656 MachineOperand &Segment = MI->getOperand(5); 11657 unsigned ArgSize = MI->getOperand(6).getImm(); 11658 unsigned ArgMode = MI->getOperand(7).getImm(); 11659 unsigned Align = MI->getOperand(8).getImm(); 11660 11661 // Memory Reference 11662 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 11663 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 11664 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 11665 11666 // Machine Information 11667 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11668 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 11669 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 11670 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 11671 DebugLoc DL = MI->getDebugLoc(); 11672 11673 // struct va_list { 11674 // i32 gp_offset 11675 // i32 fp_offset 11676 // i64 overflow_area (address) 11677 // i64 reg_save_area (address) 11678 // } 11679 // sizeof(va_list) = 24 11680 // alignment(va_list) = 8 11681 11682 unsigned TotalNumIntRegs = 6; 11683 unsigned TotalNumXMMRegs = 8; 11684 bool UseGPOffset = (ArgMode == 1); 11685 bool UseFPOffset = (ArgMode == 2); 11686 unsigned MaxOffset = TotalNumIntRegs * 8 + 11687 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 11688 11689 /* Align ArgSize to a multiple of 8 */ 11690 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 11691 bool NeedsAlign = (Align > 8); 11692 11693 MachineBasicBlock *thisMBB = MBB; 11694 MachineBasicBlock *overflowMBB; 11695 MachineBasicBlock *offsetMBB; 11696 MachineBasicBlock *endMBB; 11697 11698 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 11699 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 11700 unsigned OffsetReg = 0; 11701 11702 if (!UseGPOffset && !UseFPOffset) { 11703 // If we only pull from the overflow region, we don't create a branch. 11704 // We don't need to alter control flow. 
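    // (Illustration: ArgMode == 0 means the value is only ever passed in
    // memory, so the va_list's overflow_area pointer is the sole source;
    // no gp_offset/fp_offset bookkeeping or extra blocks are needed on this
    // path.)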
11705 OffsetDestReg = 0; // unused 11706 OverflowDestReg = DestReg; 11707 11708 offsetMBB = NULL; 11709 overflowMBB = thisMBB; 11710 endMBB = thisMBB; 11711 } else { 11712 // First emit code to check if gp_offset (or fp_offset) is below the bound. 11713 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 11714 // If not, pull from overflow_area. (branch to overflowMBB) 11715 // 11716 // thisMBB 11717 // | . 11718 // | . 11719 // offsetMBB overflowMBB 11720 // | . 11721 // | . 11722 // endMBB 11723 11724 // Registers for the PHI in endMBB 11725 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 11726 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 11727 11728 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11729 MachineFunction *MF = MBB->getParent(); 11730 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11731 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11732 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 11733 11734 MachineFunction::iterator MBBIter = MBB; 11735 ++MBBIter; 11736 11737 // Insert the new basic blocks 11738 MF->insert(MBBIter, offsetMBB); 11739 MF->insert(MBBIter, overflowMBB); 11740 MF->insert(MBBIter, endMBB); 11741 11742 // Transfer the remainder of MBB and its successor edges to endMBB. 11743 endMBB->splice(endMBB->begin(), thisMBB, 11744 llvm::next(MachineBasicBlock::iterator(MI)), 11745 thisMBB->end()); 11746 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 11747 11748 // Make offsetMBB and overflowMBB successors of thisMBB 11749 thisMBB->addSuccessor(offsetMBB); 11750 thisMBB->addSuccessor(overflowMBB); 11751 11752 // endMBB is a successor of both offsetMBB and overflowMBB 11753 offsetMBB->addSuccessor(endMBB); 11754 overflowMBB->addSuccessor(endMBB); 11755 11756 // Load the offset value into a register 11757 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 11758 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 11759 .addOperand(Base) 11760 .addOperand(Scale) 11761 .addOperand(Index) 11762 .addDisp(Disp, UseFPOffset ? 4 : 0) 11763 .addOperand(Segment) 11764 .setMemRefs(MMOBegin, MMOEnd); 11765 11766 // Check if there is enough room left to pull this argument. 11767 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 11768 .addReg(OffsetReg) 11769 .addImm(MaxOffset + 8 - ArgSizeA8); 11770 11771 // Branch to "overflowMBB" if offset >= max 11772 // Fall through to "offsetMBB" otherwise 11773 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 11774 .addMBB(overflowMBB); 11775 } 11776 11777 // In offsetMBB, emit code to use the reg_save_area. 11778 if (offsetMBB) { 11779 assert(OffsetReg != 0); 11780 11781 // Read the reg_save_area address. 11782 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 11783 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 11784 .addOperand(Base) 11785 .addOperand(Scale) 11786 .addOperand(Index) 11787 .addDisp(Disp, 16) 11788 .addOperand(Segment) 11789 .setMemRefs(MMOBegin, MMOEnd); 11790 11791 // Zero-extend the offset 11792 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 11793 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 11794 .addImm(0) 11795 .addReg(OffsetReg) 11796 .addImm(X86::sub_32bit); 11797 11798 // Add the offset to the reg_save_area to get the final address. 
11799 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 11800 .addReg(OffsetReg64) 11801 .addReg(RegSaveReg); 11802 11803 // Compute the offset for the next argument 11804 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 11805 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 11806 .addReg(OffsetReg) 11807 .addImm(UseFPOffset ? 16 : 8); 11808 11809 // Store it back into the va_list. 11810 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 11811 .addOperand(Base) 11812 .addOperand(Scale) 11813 .addOperand(Index) 11814 .addDisp(Disp, UseFPOffset ? 4 : 0) 11815 .addOperand(Segment) 11816 .addReg(NextOffsetReg) 11817 .setMemRefs(MMOBegin, MMOEnd); 11818 11819 // Jump to endMBB 11820 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 11821 .addMBB(endMBB); 11822 } 11823 11824 // 11825 // Emit code to use overflow area 11826 // 11827 11828 // Load the overflow_area address into a register. 11829 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 11830 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 11831 .addOperand(Base) 11832 .addOperand(Scale) 11833 .addOperand(Index) 11834 .addDisp(Disp, 8) 11835 .addOperand(Segment) 11836 .setMemRefs(MMOBegin, MMOEnd); 11837 11838 // If we need to align it, do so. Otherwise, just copy the address 11839 // to OverflowDestReg. 11840 if (NeedsAlign) { 11841 // Align the overflow address 11842 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 11843 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 11844 11845 // aligned_addr = (addr + (align-1)) & ~(align-1) 11846 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 11847 .addReg(OverflowAddrReg) 11848 .addImm(Align-1); 11849 11850 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 11851 .addReg(TmpReg) 11852 .addImm(~(uint64_t)(Align-1)); 11853 } else { 11854 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 11855 .addReg(OverflowAddrReg); 11856 } 11857 11858 // Compute the next overflow address after this argument. 11859 // (the overflow address should be kept 8-byte aligned) 11860 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 11861 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 11862 .addReg(OverflowDestReg) 11863 .addImm(ArgSizeA8); 11864 11865 // Store the new overflow address. 11866 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 11867 .addOperand(Base) 11868 .addOperand(Scale) 11869 .addOperand(Index) 11870 .addDisp(Disp, 8) 11871 .addOperand(Segment) 11872 .addReg(NextAddrReg) 11873 .setMemRefs(MMOBegin, MMOEnd); 11874 11875 // If we branched, emit the PHI to the front of endMBB. 11876 if (offsetMBB) { 11877 BuildMI(*endMBB, endMBB->begin(), DL, 11878 TII->get(X86::PHI), DestReg) 11879 .addReg(OffsetDestReg).addMBB(offsetMBB) 11880 .addReg(OverflowDestReg).addMBB(overflowMBB); 11881 } 11882 11883 // Erase the pseudo instruction 11884 MI->eraseFromParent(); 11885 11886 return endMBB; 11887} 11888 11889MachineBasicBlock * 11890X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 11891 MachineInstr *MI, 11892 MachineBasicBlock *MBB) const { 11893 // Emit code to save XMM registers to the stack. The ABI says that the 11894 // number of registers to save is given in %al, so it's theoretically 11895 // possible to do an indirect jump trick to avoid saving all of them, 11896 // however this code takes a simpler approach and just executes all 11897 // of the stores if %al is non-zero. 
It's less code, and it's probably 11898 // easier on the hardware branch predictor, and stores aren't all that 11899 // expensive anyway. 11900 11901 // Create the new basic blocks. One block contains all the XMM stores, 11902 // and one block is the final destination regardless of whether any 11903 // stores were performed. 11904 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 11905 MachineFunction *F = MBB->getParent(); 11906 MachineFunction::iterator MBBIter = MBB; 11907 ++MBBIter; 11908 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 11909 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 11910 F->insert(MBBIter, XMMSaveMBB); 11911 F->insert(MBBIter, EndMBB); 11912 11913 // Transfer the remainder of MBB and its successor edges to EndMBB. 11914 EndMBB->splice(EndMBB->begin(), MBB, 11915 llvm::next(MachineBasicBlock::iterator(MI)), 11916 MBB->end()); 11917 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 11918 11919 // The original block will now fall through to the XMM save block. 11920 MBB->addSuccessor(XMMSaveMBB); 11921 // The XMMSaveMBB will fall through to the end block. 11922 XMMSaveMBB->addSuccessor(EndMBB); 11923 11924 // Now add the instructions. 11925 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11926 DebugLoc DL = MI->getDebugLoc(); 11927 11928 unsigned CountReg = MI->getOperand(0).getReg(); 11929 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 11930 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 11931 11932 if (!Subtarget->isTargetWin64()) { 11933 // If %al is 0, branch around the XMM save block. 11934 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 11935 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 11936 MBB->addSuccessor(EndMBB); 11937 } 11938 11939 unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; 11940 // In the XMM save block, save all the XMM argument registers. 11941 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 11942 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 11943 MachineMemOperand *MMO = 11944 F->getMachineMemOperand( 11945 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 11946 MachineMemOperand::MOStore, 11947 /*Size=*/16, /*Align=*/16); 11948 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) 11949 .addFrameIndex(RegSaveFrameIndex) 11950 .addImm(/*Scale=*/1) 11951 .addReg(/*IndexReg=*/0) 11952 .addImm(/*Disp=*/Offset) 11953 .addReg(/*Segment=*/0) 11954 .addReg(MI->getOperand(i).getReg()) 11955 .addMemOperand(MMO); 11956 } 11957 11958 MI->eraseFromParent(); // The pseudo instruction is gone now. 11959 11960 return EndMBB; 11961} 11962 11963MachineBasicBlock * 11964X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 11965 MachineBasicBlock *BB) const { 11966 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11967 DebugLoc DL = MI->getDebugLoc(); 11968 11969 // To "insert" a SELECT_CC instruction, we actually have to insert the 11970 // diamond control-flow pattern. The incoming instruction knows the 11971 // destination vreg to set, the condition code register to branch on, the 11972 // true/false values to select between, and a branch opcode to use. 11973 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11974 MachineFunction::iterator It = BB; 11975 ++It; 11976 11977 // thisMBB: 11978 // ... 11979 // TrueVal = ... 
11980 // cmpTY ccX, r1, r2 11981 // bCC copy1MBB 11982 // fallthrough --> copy0MBB 11983 MachineBasicBlock *thisMBB = BB; 11984 MachineFunction *F = BB->getParent(); 11985 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 11986 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 11987 F->insert(It, copy0MBB); 11988 F->insert(It, sinkMBB); 11989 11990 // If the EFLAGS register isn't dead in the terminator, then claim that it's 11991 // live into the sink and copy blocks. 11992 if (!MI->killsRegister(X86::EFLAGS)) { 11993 copy0MBB->addLiveIn(X86::EFLAGS); 11994 sinkMBB->addLiveIn(X86::EFLAGS); 11995 } 11996 11997 // Transfer the remainder of BB and its successor edges to sinkMBB. 11998 sinkMBB->splice(sinkMBB->begin(), BB, 11999 llvm::next(MachineBasicBlock::iterator(MI)), 12000 BB->end()); 12001 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 12002 12003 // Add the true and fallthrough blocks as its successors. 12004 BB->addSuccessor(copy0MBB); 12005 BB->addSuccessor(sinkMBB); 12006 12007 // Create the conditional branch instruction. 12008 unsigned Opc = 12009 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 12010 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 12011 12012 // copy0MBB: 12013 // %FalseValue = ... 12014 // # fallthrough to sinkMBB 12015 copy0MBB->addSuccessor(sinkMBB); 12016 12017 // sinkMBB: 12018 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 12019 // ... 12020 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 12021 TII->get(X86::PHI), MI->getOperand(0).getReg()) 12022 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 12023 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 12024 12025 MI->eraseFromParent(); // The pseudo instruction is gone now. 12026 return sinkMBB; 12027} 12028 12029MachineBasicBlock * 12030X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, 12031 bool Is64Bit) const { 12032 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12033 DebugLoc DL = MI->getDebugLoc(); 12034 MachineFunction *MF = BB->getParent(); 12035 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 12036 12037 assert(getTargetMachine().Options.EnableSegmentedStacks); 12038 12039 unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; 12040 unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; 12041 12042 // BB: 12043 // ... [Till the alloca] 12044 // If stacklet is not large enough, jump to mallocMBB 12045 // 12046 // bumpMBB: 12047 // Allocate by subtracting from RSP 12048 // Jump to continueMBB 12049 // 12050 // mallocMBB: 12051 // Allocate by call to runtime 12052 // 12053 // continueMBB: 12054 // ... 12055 // [rest of original BB] 12056 // 12057 12058 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12059 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12060 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); 12061 12062 MachineRegisterInfo &MRI = MF->getRegInfo(); 12063 const TargetRegisterClass *AddrRegClass = 12064 getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); 12065 12066 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), 12067 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), 12068 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), 12069 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), 12070 sizeVReg = MI->getOperand(1).getReg(), 12071 physSPReg = Is64Bit ? 
X86::RSP : X86::ESP; 12072 12073 MachineFunction::iterator MBBIter = BB; 12074 ++MBBIter; 12075 12076 MF->insert(MBBIter, bumpMBB); 12077 MF->insert(MBBIter, mallocMBB); 12078 MF->insert(MBBIter, continueMBB); 12079 12080 continueMBB->splice(continueMBB->begin(), BB, llvm::next 12081 (MachineBasicBlock::iterator(MI)), BB->end()); 12082 continueMBB->transferSuccessorsAndUpdatePHIs(BB); 12083 12084 // Add code to the main basic block to check if the stack limit has been hit, 12085 // and if so, jump to mallocMBB otherwise to bumpMBB. 12086 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); 12087 BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) 12088 .addReg(tmpSPVReg).addReg(sizeVReg); 12089 BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) 12090 .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg) 12091 .addReg(SPLimitVReg); 12092 BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); 12093 12094 // bumpMBB simply decreases the stack pointer, since we know the current 12095 // stacklet has enough space. 12096 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) 12097 .addReg(SPLimitVReg); 12098 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) 12099 .addReg(SPLimitVReg); 12100 BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 12101 12102 // Calls into a routine in libgcc to allocate more space from the heap. 12103 if (Is64Bit) { 12104 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) 12105 .addReg(sizeVReg); 12106 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 12107 .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI); 12108 } else { 12109 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) 12110 .addImm(12); 12111 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); 12112 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) 12113 .addExternalSymbol("__morestack_allocate_stack_space"); 12114 } 12115 12116 if (!Is64Bit) 12117 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) 12118 .addImm(16); 12119 12120 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) 12121 .addReg(Is64Bit ? X86::RAX : X86::EAX); 12122 BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 12123 12124 // Set up the CFG correctly. 12125 BB->addSuccessor(bumpMBB); 12126 BB->addSuccessor(mallocMBB); 12127 mallocMBB->addSuccessor(continueMBB); 12128 bumpMBB->addSuccessor(continueMBB); 12129 12130 // Take care of the PHI nodes. 12131 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), 12132 MI->getOperand(0).getReg()) 12133 .addReg(mallocPtrVReg).addMBB(mallocMBB) 12134 .addReg(bumpSPPtrVReg).addMBB(bumpMBB); 12135 12136 // Delete the original pseudo instruction. 12137 MI->eraseFromParent(); 12138 12139 // And we're done. 12140 return continueMBB; 12141} 12142 12143MachineBasicBlock * 12144X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 12145 MachineBasicBlock *BB) const { 12146 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12147 DebugLoc DL = MI->getDebugLoc(); 12148 12149 assert(!Subtarget->isTargetEnvMacho()); 12150 12151 // The lowering is pretty easy: we're just emitting the call to _alloca. The 12152 // non-trivial part is impdef of ESP. 12153 12154 if (Subtarget->isTargetWin64()) { 12155 if (Subtarget->isTargetCygMing()) { 12156 // ___chkstk(Mingw64): 12157 // Clobbers R10, R11, RAX and EFLAGS. 12158 // Updates RSP. 
12159 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 12160 .addExternalSymbol("___chkstk") 12161 .addReg(X86::RAX, RegState::Implicit) 12162 .addReg(X86::RSP, RegState::Implicit) 12163 .addReg(X86::RAX, RegState::Define | RegState::Implicit) 12164 .addReg(X86::RSP, RegState::Define | RegState::Implicit) 12165 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 12166 } else { 12167 // __chkstk(MSVCRT): does not update stack pointer. 12168 // Clobbers R10, R11 and EFLAGS. 12169 // FIXME: RAX(allocated size) might be reused and not killed. 12170 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 12171 .addExternalSymbol("__chkstk") 12172 .addReg(X86::RAX, RegState::Implicit) 12173 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 12174 // RAX has the offset to subtracted from RSP. 12175 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) 12176 .addReg(X86::RSP) 12177 .addReg(X86::RAX); 12178 } 12179 } else { 12180 const char *StackProbeSymbol = 12181 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 12182 12183 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 12184 .addExternalSymbol(StackProbeSymbol) 12185 .addReg(X86::EAX, RegState::Implicit) 12186 .addReg(X86::ESP, RegState::Implicit) 12187 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 12188 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 12189 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 12190 } 12191 12192 MI->eraseFromParent(); // The pseudo instruction is gone now. 12193 return BB; 12194} 12195 12196MachineBasicBlock * 12197X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 12198 MachineBasicBlock *BB) const { 12199 // This is pretty easy. We're taking the value that we received from 12200 // our load from the relocation, sticking it in either RDI (x86-64) 12201 // or EAX and doing an indirect call. The return value will then 12202 // be in the normal return register. 12203 const X86InstrInfo *TII 12204 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 12205 DebugLoc DL = MI->getDebugLoc(); 12206 MachineFunction *F = BB->getParent(); 12207 12208 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 12209 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 12210 12211 if (Subtarget->is64Bit()) { 12212 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 12213 TII->get(X86::MOV64rm), X86::RDI) 12214 .addReg(X86::RIP) 12215 .addImm(0).addReg(0) 12216 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 12217 MI->getOperand(3).getTargetFlags()) 12218 .addReg(0); 12219 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 12220 addDirectMem(MIB, X86::RDI); 12221 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 12222 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 12223 TII->get(X86::MOV32rm), X86::EAX) 12224 .addReg(0) 12225 .addImm(0).addReg(0) 12226 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 12227 MI->getOperand(3).getTargetFlags()) 12228 .addReg(0); 12229 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 12230 addDirectMem(MIB, X86::EAX); 12231 } else { 12232 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 12233 TII->get(X86::MOV32rm), X86::EAX) 12234 .addReg(TII->getGlobalBaseReg(F)) 12235 .addImm(0).addReg(0) 12236 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 12237 MI->getOperand(3).getTargetFlags()) 12238 .addReg(0); 12239 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 12240 addDirectMem(MIB, X86::EAX); 12241 } 12242 12243 MI->eraseFromParent(); // The pseudo instruction is gone now. 
12244 return BB; 12245} 12246 12247MachineBasicBlock * 12248X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 12249 MachineBasicBlock *BB) const { 12250 switch (MI->getOpcode()) { 12251 default: assert(0 && "Unexpected instr type to insert"); 12252 case X86::TAILJMPd64: 12253 case X86::TAILJMPr64: 12254 case X86::TAILJMPm64: 12255 assert(0 && "TAILJMP64 would not be touched here."); 12256 case X86::TCRETURNdi64: 12257 case X86::TCRETURNri64: 12258 case X86::TCRETURNmi64: 12259 // Defs of TCRETURNxx64 have Win64's callee-saved registers as a subset. 12260 // On AMD64, additional defs should be added before register allocation. 12261 if (!Subtarget->isTargetWin64()) { 12262 MI->addRegisterDefined(X86::RSI); 12263 MI->addRegisterDefined(X86::RDI); 12264 MI->addRegisterDefined(X86::XMM6); 12265 MI->addRegisterDefined(X86::XMM7); 12266 MI->addRegisterDefined(X86::XMM8); 12267 MI->addRegisterDefined(X86::XMM9); 12268 MI->addRegisterDefined(X86::XMM10); 12269 MI->addRegisterDefined(X86::XMM11); 12270 MI->addRegisterDefined(X86::XMM12); 12271 MI->addRegisterDefined(X86::XMM13); 12272 MI->addRegisterDefined(X86::XMM14); 12273 MI->addRegisterDefined(X86::XMM15); 12274 } 12275 return BB; 12276 case X86::WIN_ALLOCA: 12277 return EmitLoweredWinAlloca(MI, BB); 12278 case X86::SEG_ALLOCA_32: 12279 return EmitLoweredSegAlloca(MI, BB, false); 12280 case X86::SEG_ALLOCA_64: 12281 return EmitLoweredSegAlloca(MI, BB, true); 12282 case X86::TLSCall_32: 12283 case X86::TLSCall_64: 12284 return EmitLoweredTLSCall(MI, BB); 12285 case X86::CMOV_GR8: 12286 case X86::CMOV_FR32: 12287 case X86::CMOV_FR64: 12288 case X86::CMOV_V4F32: 12289 case X86::CMOV_V2F64: 12290 case X86::CMOV_V2I64: 12291 case X86::CMOV_V8F32: 12292 case X86::CMOV_V4F64: 12293 case X86::CMOV_V4I64: 12294 case X86::CMOV_GR16: 12295 case X86::CMOV_GR32: 12296 case X86::CMOV_RFP32: 12297 case X86::CMOV_RFP64: 12298 case X86::CMOV_RFP80: 12299 return EmitLoweredSelect(MI, BB); 12300 12301 case X86::FP32_TO_INT16_IN_MEM: 12302 case X86::FP32_TO_INT32_IN_MEM: 12303 case X86::FP32_TO_INT64_IN_MEM: 12304 case X86::FP64_TO_INT16_IN_MEM: 12305 case X86::FP64_TO_INT32_IN_MEM: 12306 case X86::FP64_TO_INT64_IN_MEM: 12307 case X86::FP80_TO_INT16_IN_MEM: 12308 case X86::FP80_TO_INT32_IN_MEM: 12309 case X86::FP80_TO_INT64_IN_MEM: { 12310 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 12311 DebugLoc DL = MI->getDebugLoc(); 12312 12313 // Change the floating point control register to use "round towards zero" 12314 // mode when truncating to an integer value. 12315 MachineFunction *F = BB->getParent(); 12316 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 12317 addFrameReference(BuildMI(*BB, MI, DL, 12318 TII->get(X86::FNSTCW16m)), CWFrameIdx); 12319 12320 // Load the old value of the control word... 12321 unsigned OldCW = 12322 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 12323 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 12324 CWFrameIdx); 12325 12326 // Set the rounding mode to round toward zero... 12327 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 12328 .addImm(0xC7F); 12329 12330 // Reload the modified control word now...
12331 addFrameReference(BuildMI(*BB, MI, DL, 12332 TII->get(X86::FLDCW16m)), CWFrameIdx); 12333 12334 // Restore the memory image of control word to original value 12335 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 12336 .addReg(OldCW); 12337 12338 // Get the X86 opcode to use. 12339 unsigned Opc; 12340 switch (MI->getOpcode()) { 12341 default: llvm_unreachable("illegal opcode!"); 12342 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 12343 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 12344 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 12345 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 12346 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 12347 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 12348 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 12349 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 12350 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 12351 } 12352 12353 X86AddressMode AM; 12354 MachineOperand &Op = MI->getOperand(0); 12355 if (Op.isReg()) { 12356 AM.BaseType = X86AddressMode::RegBase; 12357 AM.Base.Reg = Op.getReg(); 12358 } else { 12359 AM.BaseType = X86AddressMode::FrameIndexBase; 12360 AM.Base.FrameIndex = Op.getIndex(); 12361 } 12362 Op = MI->getOperand(1); 12363 if (Op.isImm()) 12364 AM.Scale = Op.getImm(); 12365 Op = MI->getOperand(2); 12366 if (Op.isImm()) 12367 AM.IndexReg = Op.getImm(); 12368 Op = MI->getOperand(3); 12369 if (Op.isGlobal()) { 12370 AM.GV = Op.getGlobal(); 12371 } else { 12372 AM.Disp = Op.getImm(); 12373 } 12374 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 12375 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 12376 12377 // Reload the original control word now. 12378 addFrameReference(BuildMI(*BB, MI, DL, 12379 TII->get(X86::FLDCW16m)), CWFrameIdx); 12380 12381 MI->eraseFromParent(); // The pseudo instruction is gone now. 12382 return BB; 12383 } 12384 // String/text processing lowering. 12385 case X86::PCMPISTRM128REG: 12386 case X86::VPCMPISTRM128REG: 12387 return EmitPCMP(MI, BB, 3, false /* in-mem */); 12388 case X86::PCMPISTRM128MEM: 12389 case X86::VPCMPISTRM128MEM: 12390 return EmitPCMP(MI, BB, 3, true /* in-mem */); 12391 case X86::PCMPESTRM128REG: 12392 case X86::VPCMPESTRM128REG: 12393 return EmitPCMP(MI, BB, 5, false /* in mem */); 12394 case X86::PCMPESTRM128MEM: 12395 case X86::VPCMPESTRM128MEM: 12396 return EmitPCMP(MI, BB, 5, true /* in mem */); 12397 12398 // Thread synchronization. 12399 case X86::MONITOR: 12400 return EmitMonitor(MI, BB); 12401 case X86::MWAIT: 12402 return EmitMwait(MI, BB); 12403 12404 // Atomic Lowering. 
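  // A note on the Emit*WithCustomInserter calls below, in short: each one
  // expands an ATOM* pseudo into a load / operate / LOCK CMPXCHG retry loop.
  // The bitwise variants take the reg/reg and reg/imm ALU opcodes, the plain
  // load, the LOCKed compare-and-exchange, the NOT opcode, the accumulator
  // register and the register class for that width; the extra 'true' passed
  // by the NAND cases enables the inversion step that uses the NOT opcode.
  // The min/max variants only need the CMOV opcode used to pick between the
  // loaded value and the operand.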
12405 case X86::ATOMAND32: 12406 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 12407 X86::AND32ri, X86::MOV32rm, 12408 X86::LCMPXCHG32, 12409 X86::NOT32r, X86::EAX, 12410 X86::GR32RegisterClass); 12411 case X86::ATOMOR32: 12412 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 12413 X86::OR32ri, X86::MOV32rm, 12414 X86::LCMPXCHG32, 12415 X86::NOT32r, X86::EAX, 12416 X86::GR32RegisterClass); 12417 case X86::ATOMXOR32: 12418 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 12419 X86::XOR32ri, X86::MOV32rm, 12420 X86::LCMPXCHG32, 12421 X86::NOT32r, X86::EAX, 12422 X86::GR32RegisterClass); 12423 case X86::ATOMNAND32: 12424 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 12425 X86::AND32ri, X86::MOV32rm, 12426 X86::LCMPXCHG32, 12427 X86::NOT32r, X86::EAX, 12428 X86::GR32RegisterClass, true); 12429 case X86::ATOMMIN32: 12430 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 12431 case X86::ATOMMAX32: 12432 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 12433 case X86::ATOMUMIN32: 12434 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 12435 case X86::ATOMUMAX32: 12436 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 12437 12438 case X86::ATOMAND16: 12439 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 12440 X86::AND16ri, X86::MOV16rm, 12441 X86::LCMPXCHG16, 12442 X86::NOT16r, X86::AX, 12443 X86::GR16RegisterClass); 12444 case X86::ATOMOR16: 12445 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 12446 X86::OR16ri, X86::MOV16rm, 12447 X86::LCMPXCHG16, 12448 X86::NOT16r, X86::AX, 12449 X86::GR16RegisterClass); 12450 case X86::ATOMXOR16: 12451 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 12452 X86::XOR16ri, X86::MOV16rm, 12453 X86::LCMPXCHG16, 12454 X86::NOT16r, X86::AX, 12455 X86::GR16RegisterClass); 12456 case X86::ATOMNAND16: 12457 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 12458 X86::AND16ri, X86::MOV16rm, 12459 X86::LCMPXCHG16, 12460 X86::NOT16r, X86::AX, 12461 X86::GR16RegisterClass, true); 12462 case X86::ATOMMIN16: 12463 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 12464 case X86::ATOMMAX16: 12465 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 12466 case X86::ATOMUMIN16: 12467 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 12468 case X86::ATOMUMAX16: 12469 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 12470 12471 case X86::ATOMAND8: 12472 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 12473 X86::AND8ri, X86::MOV8rm, 12474 X86::LCMPXCHG8, 12475 X86::NOT8r, X86::AL, 12476 X86::GR8RegisterClass); 12477 case X86::ATOMOR8: 12478 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 12479 X86::OR8ri, X86::MOV8rm, 12480 X86::LCMPXCHG8, 12481 X86::NOT8r, X86::AL, 12482 X86::GR8RegisterClass); 12483 case X86::ATOMXOR8: 12484 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 12485 X86::XOR8ri, X86::MOV8rm, 12486 X86::LCMPXCHG8, 12487 X86::NOT8r, X86::AL, 12488 X86::GR8RegisterClass); 12489 case X86::ATOMNAND8: 12490 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 12491 X86::AND8ri, X86::MOV8rm, 12492 X86::LCMPXCHG8, 12493 X86::NOT8r, X86::AL, 12494 X86::GR8RegisterClass, true); 12495 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 12496 // This group is for 64-bit host. 
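  // The 64-bit cases below mirror the 32-bit group above, substituting the
  // 64-bit ALU opcodes, LCMPXCHG64, RAX and GR64; they are only used on a
  // 64-bit host, while the ATOM*6432 group further down handles 64-bit
  // atomics on a 32-bit host.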
12497 case X86::ATOMAND64: 12498 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 12499 X86::AND64ri32, X86::MOV64rm, 12500 X86::LCMPXCHG64, 12501 X86::NOT64r, X86::RAX, 12502 X86::GR64RegisterClass); 12503 case X86::ATOMOR64: 12504 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 12505 X86::OR64ri32, X86::MOV64rm, 12506 X86::LCMPXCHG64, 12507 X86::NOT64r, X86::RAX, 12508 X86::GR64RegisterClass); 12509 case X86::ATOMXOR64: 12510 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 12511 X86::XOR64ri32, X86::MOV64rm, 12512 X86::LCMPXCHG64, 12513 X86::NOT64r, X86::RAX, 12514 X86::GR64RegisterClass); 12515 case X86::ATOMNAND64: 12516 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 12517 X86::AND64ri32, X86::MOV64rm, 12518 X86::LCMPXCHG64, 12519 X86::NOT64r, X86::RAX, 12520 X86::GR64RegisterClass, true); 12521 case X86::ATOMMIN64: 12522 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 12523 case X86::ATOMMAX64: 12524 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 12525 case X86::ATOMUMIN64: 12526 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 12527 case X86::ATOMUMAX64: 12528 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 12529 12530 // This group does 64-bit operations on a 32-bit host. 12531 case X86::ATOMAND6432: 12532 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12533 X86::AND32rr, X86::AND32rr, 12534 X86::AND32ri, X86::AND32ri, 12535 false); 12536 case X86::ATOMOR6432: 12537 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12538 X86::OR32rr, X86::OR32rr, 12539 X86::OR32ri, X86::OR32ri, 12540 false); 12541 case X86::ATOMXOR6432: 12542 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12543 X86::XOR32rr, X86::XOR32rr, 12544 X86::XOR32ri, X86::XOR32ri, 12545 false); 12546 case X86::ATOMNAND6432: 12547 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12548 X86::AND32rr, X86::AND32rr, 12549 X86::AND32ri, X86::AND32ri, 12550 true); 12551 case X86::ATOMADD6432: 12552 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12553 X86::ADD32rr, X86::ADC32rr, 12554 X86::ADD32ri, X86::ADC32ri, 12555 false); 12556 case X86::ATOMSUB6432: 12557 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12558 X86::SUB32rr, X86::SBB32rr, 12559 X86::SUB32ri, X86::SBB32ri, 12560 false); 12561 case X86::ATOMSWAP6432: 12562 return EmitAtomicBit6432WithCustomInserter(MI, BB, 12563 X86::MOV32rr, X86::MOV32rr, 12564 X86::MOV32ri, X86::MOV32ri, 12565 false); 12566 case X86::VASTART_SAVE_XMM_REGS: 12567 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 12568 12569 case X86::VAARG_64: 12570 return EmitVAARG64WithCustomInserter(MI, BB); 12571 } 12572} 12573 12574//===----------------------------------------------------------------------===// 12575// X86 Optimization Hooks 12576//===----------------------------------------------------------------------===// 12577 12578void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 12579 const APInt &Mask, 12580 APInt &KnownZero, 12581 APInt &KnownOne, 12582 const SelectionDAG &DAG, 12583 unsigned Depth) const { 12584 unsigned Opc = Op.getOpcode(); 12585 assert((Opc >= ISD::BUILTIN_OP_END || 12586 Opc == ISD::INTRINSIC_WO_CHAIN || 12587 Opc == ISD::INTRINSIC_W_CHAIN || 12588 Opc == ISD::INTRINSIC_VOID) && 12589 "Should use MaskedValueIsZero if you don't know whether Op" 12590 " is a target node!"); 12591 12592 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
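  // The switch below reports what we do know: the boolean results handled
  // here (the second result of the flag-producing arithmetic nodes, the only
  // result of SETCC) are 0 or 1, so every bit above bit 0 is known zero, and
  // the MOVMSK-style intrinsics only populate their low NumLoBits bits.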
12593 switch (Opc) { 12594 default: break; 12595 case X86ISD::ADD: 12596 case X86ISD::SUB: 12597 case X86ISD::ADC: 12598 case X86ISD::SBB: 12599 case X86ISD::SMUL: 12600 case X86ISD::UMUL: 12601 case X86ISD::INC: 12602 case X86ISD::DEC: 12603 case X86ISD::OR: 12604 case X86ISD::XOR: 12605 case X86ISD::AND: 12606 // These nodes' second result is a boolean. 12607 if (Op.getResNo() == 0) 12608 break; 12609 // Fallthrough 12610 case X86ISD::SETCC: 12611 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 12612 Mask.getBitWidth() - 1); 12613 break; 12614 case ISD::INTRINSIC_WO_CHAIN: { 12615 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 12616 unsigned NumLoBits = 0; 12617 switch (IntId) { 12618 default: break; 12619 case Intrinsic::x86_sse_movmsk_ps: 12620 case Intrinsic::x86_avx_movmsk_ps_256: 12621 case Intrinsic::x86_sse2_movmsk_pd: 12622 case Intrinsic::x86_avx_movmsk_pd_256: 12623 case Intrinsic::x86_mmx_pmovmskb: 12624 case Intrinsic::x86_sse2_pmovmskb_128: { 12625 // High bits of movmskp{s|d}, pmovmskb are known zero. 12626 switch (IntId) { 12627 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; 12628 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; 12629 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; 12630 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; 12631 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; 12632 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; 12633 } 12634 KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(), 12635 Mask.getBitWidth() - NumLoBits); 12636 break; 12637 } 12638 } 12639 break; 12640 } 12641 } 12642} 12643 12644unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 12645 unsigned Depth) const { 12646 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 12647 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 12648 return Op.getValueType().getScalarType().getSizeInBits(); 12649 12650 // Fallback case. 12651 return 1; 12652} 12653 12654/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 12655/// node is a GlobalAddress + offset. 
12656bool X86TargetLowering::isGAPlusOffset(SDNode *N, 12657 const GlobalValue* &GA, 12658 int64_t &Offset) const { 12659 if (N->getOpcode() == X86ISD::Wrapper) { 12660 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 12661 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 12662 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 12663 return true; 12664 } 12665 } 12666 return TargetLowering::isGAPlusOffset(N, GA, Offset); 12667} 12668 12669/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 12670/// same as extracting the high 128-bit part of 256-bit vector and then 12671/// inserting the result into the low part of a new 256-bit vector 12672static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 12673 EVT VT = SVOp->getValueType(0); 12674 int NumElems = VT.getVectorNumElements(); 12675 12676 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 12677 for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j) 12678 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 12679 SVOp->getMaskElt(j) >= 0) 12680 return false; 12681 12682 return true; 12683} 12684 12685/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 12686/// same as extracting the low 128-bit part of 256-bit vector and then 12687/// inserting the result into the high part of a new 256-bit vector 12688static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 12689 EVT VT = SVOp->getValueType(0); 12690 int NumElems = VT.getVectorNumElements(); 12691 12692 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 12693 for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j) 12694 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 12695 SVOp->getMaskElt(j) >= 0) 12696 return false; 12697 12698 return true; 12699} 12700 12701/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 12702static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 12703 TargetLowering::DAGCombinerInfo &DCI) { 12704 DebugLoc dl = N->getDebugLoc(); 12705 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 12706 SDValue V1 = SVOp->getOperand(0); 12707 SDValue V2 = SVOp->getOperand(1); 12708 EVT VT = SVOp->getValueType(0); 12709 int NumElems = VT.getVectorNumElements(); 12710 12711 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 12712 V2.getOpcode() == ISD::CONCAT_VECTORS) { 12713 // 12714 // 0,0,0,... 12715 // | 12716 // V UNDEF BUILD_VECTOR UNDEF 12717 // \ / \ / 12718 // CONCAT_VECTOR CONCAT_VECTOR 12719 // \ / 12720 // \ / 12721 // RESULT: V + zero extended 12722 // 12723 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 12724 V2.getOperand(1).getOpcode() != ISD::UNDEF || 12725 V1.getOperand(1).getOpcode() != ISD::UNDEF) 12726 return SDValue(); 12727 12728 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 12729 return SDValue(); 12730 12731 // To match the shuffle mask, the first half of the mask should 12732 // be exactly the first vector, and all the rest a splat with the 12733 // first element of the second one. 12734 for (int i = 0; i < NumElems/2; ++i) 12735 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 12736 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 12737 return SDValue(); 12738 12739 // Emit a zeroed vector and insert the desired subvector on its 12740 // first half. 
12741 SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); 12742 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 12743 DAG.getConstant(0, MVT::i32), DAG, dl); 12744 return DCI.CombineTo(N, InsV); 12745 } 12746 12747 //===--------------------------------------------------------------------===// 12748 // Combine some shuffles into subvector extracts and inserts: 12749 // 12750 12751 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 12752 if (isShuffleHigh128VectorInsertLow(SVOp)) { 12753 SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32), 12754 DAG, dl); 12755 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), 12756 V, DAG.getConstant(0, MVT::i32), DAG, dl); 12757 return DCI.CombineTo(N, InsV); 12758 } 12759 12760 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 12761 if (isShuffleLow128VectorInsertHigh(SVOp)) { 12762 SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl); 12763 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), 12764 V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 12765 return DCI.CombineTo(N, InsV); 12766 } 12767 12768 return SDValue(); 12769} 12770 12771/// PerformShuffleCombine - Performs several different shuffle combines. 12772static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 12773 TargetLowering::DAGCombinerInfo &DCI, 12774 const X86Subtarget *Subtarget) { 12775 DebugLoc dl = N->getDebugLoc(); 12776 EVT VT = N->getValueType(0); 12777 12778 // Don't create instructions with illegal types after legalize types has run. 12779 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12780 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 12781 return SDValue(); 12782 12783 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode 12784 if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 && 12785 N->getOpcode() == ISD::VECTOR_SHUFFLE) 12786 return PerformShuffleCombine256(N, DAG, DCI); 12787 12788 // Only handle 128 wide vector from here on. 12789 if (VT.getSizeInBits() != 128) 12790 return SDValue(); 12791 12792 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 12793 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 12794 // consecutive, non-overlapping, and in the right order. 12795 SmallVector<SDValue, 16> Elts; 12796 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 12797 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 12798 12799 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 12800} 12801 12802/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 12803/// generation and convert it from being a bunch of shuffles and extracts 12804/// to a simple store and scalar loads to extract the elements. 12805static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 12806 const TargetLowering &TLI) { 12807 SDValue InputVector = N->getOperand(0); 12808 12809 // Only operate on vectors of 4 elements, where the alternative shuffling 12810 // gets to be more expensive. 12811 if (InputVector.getValueType() != MVT::v4i32) 12812 return SDValue(); 12813 12814 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 12815 // single use which is a sign-extend or zero-extend, and all elements are 12816 // used. 
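  // In DAG terms the pattern looked for here is roughly four
  //   (sext/zext (extract_vector_elt <4 x i32> %v, i)), i = 0..3,
  // which is rewritten below as a single store of %v to a stack temporary
  // followed by four scalar loads, instead of a chain of shuffles and
  // element extracts.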
12817 SmallVector<SDNode *, 4> Uses; 12818 unsigned ExtractedElements = 0; 12819 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 12820 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 12821 if (UI.getUse().getResNo() != InputVector.getResNo()) 12822 return SDValue(); 12823 12824 SDNode *Extract = *UI; 12825 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 12826 return SDValue(); 12827 12828 if (Extract->getValueType(0) != MVT::i32) 12829 return SDValue(); 12830 if (!Extract->hasOneUse()) 12831 return SDValue(); 12832 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 12833 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 12834 return SDValue(); 12835 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 12836 return SDValue(); 12837 12838 // Record which element was extracted. 12839 ExtractedElements |= 12840 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 12841 12842 Uses.push_back(Extract); 12843 } 12844 12845 // If not all the elements were used, this may not be worthwhile. 12846 if (ExtractedElements != 15) 12847 return SDValue(); 12848 12849 // Ok, we've now decided to do the transformation. 12850 DebugLoc dl = InputVector.getDebugLoc(); 12851 12852 // Store the value to a temporary stack slot. 12853 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 12854 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 12855 MachinePointerInfo(), false, false, 0); 12856 12857 // Replace each use (extract) with a load of the appropriate element. 12858 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 12859 UE = Uses.end(); UI != UE; ++UI) { 12860 SDNode *Extract = *UI; 12861 12862 // Compute the element's address. 12863 SDValue Idx = Extract->getOperand(1); 12864 unsigned EltSize = 12865 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 12866 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 12867 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 12868 12869 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), 12870 StackPtr, OffsetVal); 12871 12872 // Load the scalar. 12873 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 12874 ScalarAddr, MachinePointerInfo(), 12875 false, false, false, 0); 12876 12877 // Replace the extract with the load. 12878 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 12879 } 12880 12881 // The replacement was made in place; don't return anything. 12882 return SDValue(); 12883} 12884 12885/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT 12886/// nodes. 12887static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 12888 const X86Subtarget *Subtarget) { 12889 DebugLoc DL = N->getDebugLoc(); 12890 SDValue Cond = N->getOperand(0); 12891 // Get the LHS/RHS of the select. 12892 SDValue LHS = N->getOperand(1); 12893 SDValue RHS = N->getOperand(2); 12894 EVT VT = LHS.getValueType(); 12895 12896 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 12897 // instructions match the semantics of the common C idiom x<y?x:y but not 12898 // x<=y?x:y, because of how they handle negative zero (which can be 12899 // ignored in unsafe-math mode).
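  // Concretely: with LHS = +0.0 and RHS = -0.0, the C expression
  // 'LHS <= RHS ? LHS : RHS' is +0.0, but MINSS would return -0.0 (it returns
  // its second operand when the operands compare equal), which is why the
  // SETOLE case below bails out unless UnsafeFPMath is set or one operand is
  // known to be non-zero.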
12900 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && 12901 VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) && 12902 (Subtarget->hasXMMInt() || 12903 (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { 12904 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 12905 12906 unsigned Opcode = 0; 12907 // Check for x CC y ? x : y. 12908 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 12909 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 12910 switch (CC) { 12911 default: break; 12912 case ISD::SETULT: 12913 // Converting this to a min would handle NaNs incorrectly, and swapping 12914 // the operands would cause it to handle comparisons between positive 12915 // and negative zero incorrectly. 12916 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 12917 if (!DAG.getTarget().Options.UnsafeFPMath && 12918 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 12919 break; 12920 std::swap(LHS, RHS); 12921 } 12922 Opcode = X86ISD::FMIN; 12923 break; 12924 case ISD::SETOLE: 12925 // Converting this to a min would handle comparisons between positive 12926 // and negative zero incorrectly. 12927 if (!DAG.getTarget().Options.UnsafeFPMath && 12928 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 12929 break; 12930 Opcode = X86ISD::FMIN; 12931 break; 12932 case ISD::SETULE: 12933 // Converting this to a min would handle both negative zeros and NaNs 12934 // incorrectly, but we can swap the operands to fix both. 12935 std::swap(LHS, RHS); 12936 case ISD::SETOLT: 12937 case ISD::SETLT: 12938 case ISD::SETLE: 12939 Opcode = X86ISD::FMIN; 12940 break; 12941 12942 case ISD::SETOGE: 12943 // Converting this to a max would handle comparisons between positive 12944 // and negative zero incorrectly. 12945 if (!DAG.getTarget().Options.UnsafeFPMath && 12946 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 12947 break; 12948 Opcode = X86ISD::FMAX; 12949 break; 12950 case ISD::SETUGT: 12951 // Converting this to a max would handle NaNs incorrectly, and swapping 12952 // the operands would cause it to handle comparisons between positive 12953 // and negative zero incorrectly. 12954 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 12955 if (!DAG.getTarget().Options.UnsafeFPMath && 12956 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 12957 break; 12958 std::swap(LHS, RHS); 12959 } 12960 Opcode = X86ISD::FMAX; 12961 break; 12962 case ISD::SETUGE: 12963 // Converting this to a max would handle both negative zeros and NaNs 12964 // incorrectly, but we can swap the operands to fix both. 12965 std::swap(LHS, RHS); 12966 case ISD::SETOGT: 12967 case ISD::SETGT: 12968 case ISD::SETGE: 12969 Opcode = X86ISD::FMAX; 12970 break; 12971 } 12972 // Check for x CC y ? y : x -- a min/max with reversed arms. 12973 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 12974 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 12975 switch (CC) { 12976 default: break; 12977 case ISD::SETOGE: 12978 // Converting this to a min would handle comparisons between positive 12979 // and negative zero incorrectly, and swapping the operands would 12980 // cause it to handle NaNs incorrectly. 12981 if (!DAG.getTarget().Options.UnsafeFPMath && 12982 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 12983 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 12984 break; 12985 std::swap(LHS, RHS); 12986 } 12987 Opcode = X86ISD::FMIN; 12988 break; 12989 case ISD::SETUGT: 12990 // Converting this to a min would handle NaNs incorrectly. 
12991 if (!DAG.getTarget().Options.UnsafeFPMath && 12992 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 12993 break; 12994 Opcode = X86ISD::FMIN; 12995 break; 12996 case ISD::SETUGE: 12997 // Converting this to a min would handle both negative zeros and NaNs 12998 // incorrectly, but we can swap the operands to fix both. 12999 std::swap(LHS, RHS); 13000 case ISD::SETOGT: 13001 case ISD::SETGT: 13002 case ISD::SETGE: 13003 Opcode = X86ISD::FMIN; 13004 break; 13005 13006 case ISD::SETULT: 13007 // Converting this to a max would handle NaNs incorrectly. 13008 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 13009 break; 13010 Opcode = X86ISD::FMAX; 13011 break; 13012 case ISD::SETOLE: 13013 // Converting this to a max would handle comparisons between positive 13014 // and negative zero incorrectly, and swapping the operands would 13015 // cause it to handle NaNs incorrectly. 13016 if (!DAG.getTarget().Options.UnsafeFPMath && 13017 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 13018 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 13019 break; 13020 std::swap(LHS, RHS); 13021 } 13022 Opcode = X86ISD::FMAX; 13023 break; 13024 case ISD::SETULE: 13025 // Converting this to a max would handle both negative zeros and NaNs 13026 // incorrectly, but we can swap the operands to fix both. 13027 std::swap(LHS, RHS); 13028 case ISD::SETOLT: 13029 case ISD::SETLT: 13030 case ISD::SETLE: 13031 Opcode = X86ISD::FMAX; 13032 break; 13033 } 13034 } 13035 13036 if (Opcode) 13037 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 13038 } 13039 13040 // If this is a select between two integer constants, try to do some 13041 // optimizations. 13042 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 13043 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 13044 // Don't do this for crazy integer types. 13045 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 13046 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 13047 // so that TrueC (the true value) is larger than FalseC. 13048 bool NeedsCondInvert = false; 13049 13050 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 13051 // Efficiently invertible. 13052 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 13053 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 13054 isa<ConstantSDNode>(Cond.getOperand(1))))) { 13055 NeedsCondInvert = true; 13056 std::swap(TrueC, FalseC); 13057 } 13058 13059 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 13060 if (FalseC->getAPIntValue() == 0 && 13061 TrueC->getAPIntValue().isPowerOf2()) { 13062 if (NeedsCondInvert) // Invert the condition if needed. 13063 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 13064 DAG.getConstant(1, Cond.getValueType())); 13065 13066 // Zero extend the condition if needed. 13067 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 13068 13069 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 13070 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 13071 DAG.getConstant(ShAmt, MVT::i8)); 13072 } 13073 13074 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 13075 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 13076 if (NeedsCondInvert) // Invert the condition if needed. 13077 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 13078 DAG.getConstant(1, Cond.getValueType())); 13079 13080 // Zero extend the condition if needed. 
13081 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 13082 FalseC->getValueType(0), Cond); 13083 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 13084 SDValue(FalseC, 0)); 13085 } 13086 13087 // Optimize cases that will turn into an LEA instruction. This requires 13088 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 13089 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 13090 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 13091 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 13092 13093 bool isFastMultiplier = false; 13094 if (Diff < 10) { 13095 switch ((unsigned char)Diff) { 13096 default: break; 13097 case 1: // result = add base, cond 13098 case 2: // result = lea base( , cond*2) 13099 case 3: // result = lea base(cond, cond*2) 13100 case 4: // result = lea base( , cond*4) 13101 case 5: // result = lea base(cond, cond*4) 13102 case 8: // result = lea base( , cond*8) 13103 case 9: // result = lea base(cond, cond*8) 13104 isFastMultiplier = true; 13105 break; 13106 } 13107 } 13108 13109 if (isFastMultiplier) { 13110 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 13111 if (NeedsCondInvert) // Invert the condition if needed. 13112 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 13113 DAG.getConstant(1, Cond.getValueType())); 13114 13115 // Zero extend the condition if needed. 13116 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 13117 Cond); 13118 // Scale the condition by the difference. 13119 if (Diff != 1) 13120 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 13121 DAG.getConstant(Diff, Cond.getValueType())); 13122 13123 // Add the base if non-zero. 13124 if (FalseC->getAPIntValue() != 0) 13125 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 13126 SDValue(FalseC, 0)); 13127 return Cond; 13128 } 13129 } 13130 } 13131 } 13132 13133 return SDValue(); 13134} 13135 13136/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 13137static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 13138 TargetLowering::DAGCombinerInfo &DCI) { 13139 DebugLoc DL = N->getDebugLoc(); 13140 13141 // If the flag operand isn't dead, don't touch this CMOV. 13142 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 13143 return SDValue(); 13144 13145 SDValue FalseOp = N->getOperand(0); 13146 SDValue TrueOp = N->getOperand(1); 13147 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 13148 SDValue Cond = N->getOperand(3); 13149 if (CC == X86::COND_E || CC == X86::COND_NE) { 13150 switch (Cond.getOpcode()) { 13151 default: break; 13152 case X86ISD::BSR: 13153 case X86ISD::BSF: 13154 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 13155 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 13156 return (CC == X86::COND_E) ? FalseOp : TrueOp; 13157 } 13158 } 13159 13160 // If this is a select between two integer constants, try to do some 13161 // optimizations. Note that the operands are ordered the opposite of SELECT 13162 // operands. 13163 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 13164 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 13165 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 13166 // larger than FalseC (the false value). 13167 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 13168 CC = X86::GetOppositeBranchCondition(CC); 13169 std::swap(TrueC, FalseC); 13170 } 13171 13172 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. 
Likewise for any pow2/0. 13173 // This is efficient for any integer data type (including i8/i16) and 13174 // shift amount. 13175 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 13176 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 13177 DAG.getConstant(CC, MVT::i8), Cond); 13178 13179 // Zero extend the condition if needed. 13180 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 13181 13182 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 13183 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 13184 DAG.getConstant(ShAmt, MVT::i8)); 13185 if (N->getNumValues() == 2) // Dead flag value? 13186 return DCI.CombineTo(N, Cond, SDValue()); 13187 return Cond; 13188 } 13189 13190 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 13191 // for any integer data type, including i8/i16. 13192 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 13193 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 13194 DAG.getConstant(CC, MVT::i8), Cond); 13195 13196 // Zero extend the condition if needed. 13197 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 13198 FalseC->getValueType(0), Cond); 13199 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 13200 SDValue(FalseC, 0)); 13201 13202 if (N->getNumValues() == 2) // Dead flag value? 13203 return DCI.CombineTo(N, Cond, SDValue()); 13204 return Cond; 13205 } 13206 13207 // Optimize cases that will turn into an LEA instruction. This requires 13208 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 13209 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 13210 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 13211 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 13212 13213 bool isFastMultiplier = false; 13214 if (Diff < 10) { 13215 switch ((unsigned char)Diff) { 13216 default: break; 13217 case 1: // result = add base, cond 13218 case 2: // result = lea base( , cond*2) 13219 case 3: // result = lea base(cond, cond*2) 13220 case 4: // result = lea base( , cond*4) 13221 case 5: // result = lea base(cond, cond*4) 13222 case 8: // result = lea base( , cond*8) 13223 case 9: // result = lea base(cond, cond*8) 13224 isFastMultiplier = true; 13225 break; 13226 } 13227 } 13228 13229 if (isFastMultiplier) { 13230 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 13231 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 13232 DAG.getConstant(CC, MVT::i8), Cond); 13233 // Zero extend the condition if needed. 13234 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 13235 Cond); 13236 // Scale the condition by the difference. 13237 if (Diff != 1) 13238 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 13239 DAG.getConstant(Diff, Cond.getValueType())); 13240 13241 // Add the base if non-zero. 13242 if (FalseC->getAPIntValue() != 0) 13243 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 13244 SDValue(FalseC, 0)); 13245 if (N->getNumValues() == 2) // Dead flag value? 13246 return DCI.CombineTo(N, Cond, SDValue()); 13247 return Cond; 13248 } 13249 } 13250 } 13251 } 13252 return SDValue(); 13253} 13254 13255 13256/// PerformMulCombine - Optimize a single multiply with constant into two 13257/// in order to implement it with two cheaper instructions, e.g. 13258/// LEA + SHL, LEA + LEA. 
13259static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 13260 TargetLowering::DAGCombinerInfo &DCI) { 13261 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13262 return SDValue(); 13263 13264 EVT VT = N->getValueType(0); 13265 if (VT != MVT::i64) 13266 return SDValue(); 13267 13268 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13269 if (!C) 13270 return SDValue(); 13271 uint64_t MulAmt = C->getZExtValue(); 13272 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 13273 return SDValue(); 13274 13275 uint64_t MulAmt1 = 0; 13276 uint64_t MulAmt2 = 0; 13277 if ((MulAmt % 9) == 0) { 13278 MulAmt1 = 9; 13279 MulAmt2 = MulAmt / 9; 13280 } else if ((MulAmt % 5) == 0) { 13281 MulAmt1 = 5; 13282 MulAmt2 = MulAmt / 5; 13283 } else if ((MulAmt % 3) == 0) { 13284 MulAmt1 = 3; 13285 MulAmt2 = MulAmt / 3; 13286 } 13287 if (MulAmt2 && 13288 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 13289 DebugLoc DL = N->getDebugLoc(); 13290 13291 if (isPowerOf2_64(MulAmt2) && 13292 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 13293 // If the second multiplier is pow2, issue it first. We want the multiply 13294 // by 3, 5, or 9 to be folded into the addressing mode unless the lone use 13295 // is an add. 13296 std::swap(MulAmt1, MulAmt2); 13297 13298 SDValue NewMul; 13299 if (isPowerOf2_64(MulAmt1)) 13300 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 13301 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 13302 else 13303 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 13304 DAG.getConstant(MulAmt1, VT)); 13305 13306 if (isPowerOf2_64(MulAmt2)) 13307 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 13308 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 13309 else 13310 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 13311 DAG.getConstant(MulAmt2, VT)); 13312 13313 // Do not add new nodes to DAG combiner worklist. 13314 DCI.CombineTo(N, NewMul, false); 13315 } 13316 return SDValue(); 13317} 13318 13319static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 13320 SDValue N0 = N->getOperand(0); 13321 SDValue N1 = N->getOperand(1); 13322 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 13323 EVT VT = N0.getValueType(); 13324 13325 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 13326 // since the result of setcc_c is all zero's or all ones. 13327 if (VT.isInteger() && !VT.isVector() && 13328 N1C && N0.getOpcode() == ISD::AND && 13329 N0.getOperand(1).getOpcode() == ISD::Constant) { 13330 SDValue N00 = N0.getOperand(0); 13331 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 13332 ((N00.getOpcode() == ISD::ANY_EXTEND || 13333 N00.getOpcode() == ISD::ZERO_EXTEND) && 13334 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 13335 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 13336 APInt ShAmt = N1C->getAPIntValue(); 13337 Mask = Mask.shl(ShAmt); 13338 if (Mask != 0) 13339 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 13340 N00, DAG.getConstant(Mask, VT)); 13341 } 13342 } 13343 13344 13345 // Hardware support for vector shifts is sparse, which makes us scalarize the 13346 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than 13347 // shl. 13348 // (shl V, 1) -> add V,V 13349 if (isSplatVector(N1.getNode())) { 13350 assert(N0.getValueType().isVector() && "Invalid vector shift type"); 13351 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0)); 13352 // We shift all of the values by one.
In many cases we do not have 13353 // hardware support for this operation. This is better expressed as an ADD 13354 // of two values. 13355 if (N1C && (1 == N1C->getZExtValue())) { 13356 return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0); 13357 } 13358 } 13359 13360 return SDValue(); 13361} 13362 13363/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 13364/// when possible. 13365static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 13366 const X86Subtarget *Subtarget) { 13367 EVT VT = N->getValueType(0); 13368 if (N->getOpcode() == ISD::SHL) { 13369 SDValue V = PerformSHLCombine(N, DAG); 13370 if (V.getNode()) return V; 13371 } 13372 13373 // On X86 with SSE2 support, we can transform this to a vector shift if 13374 // all elements are shifted by the same amount. We can't do this in legalize 13375 // because a constant vector is typically transformed to a constant pool load, 13376 // so we have no knowledge of the shift amount. 13377 if (!Subtarget->hasXMMInt()) 13378 return SDValue(); 13379 13380 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && 13381 (!Subtarget->hasAVX2() || 13382 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) 13383 return SDValue(); 13384 13385 SDValue ShAmtOp = N->getOperand(1); 13386 EVT EltVT = VT.getVectorElementType(); 13387 DebugLoc DL = N->getDebugLoc(); 13388 SDValue BaseShAmt = SDValue(); 13389 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 13390 unsigned NumElts = VT.getVectorNumElements(); 13391 unsigned i = 0; 13392 for (; i != NumElts; ++i) { 13393 SDValue Arg = ShAmtOp.getOperand(i); 13394 if (Arg.getOpcode() == ISD::UNDEF) continue; 13395 BaseShAmt = Arg; 13396 break; 13397 } 13398 for (; i != NumElts; ++i) { 13399 SDValue Arg = ShAmtOp.getOperand(i); 13400 if (Arg.getOpcode() == ISD::UNDEF) continue; 13401 if (Arg != BaseShAmt) { 13402 return SDValue(); 13403 } 13404 } 13405 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 13406 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 13407 SDValue InVec = ShAmtOp.getOperand(0); 13408 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 13409 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 13410 unsigned i = 0; 13411 for (; i != NumElts; ++i) { 13412 SDValue Arg = InVec.getOperand(i); 13413 if (Arg.getOpcode() == ISD::UNDEF) continue; 13414 BaseShAmt = Arg; 13415 break; 13416 } 13417 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 13418 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 13419 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 13420 if (C->getZExtValue() == SplatIdx) 13421 BaseShAmt = InVec.getOperand(1); 13422 } 13423 } 13424 if (BaseShAmt.getNode() == 0) 13425 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 13426 DAG.getIntPtrConstant(0)); 13427 } else 13428 return SDValue(); 13429 13430 // The shift amount is an i32. 13431 if (EltVT.bitsGT(MVT::i32)) 13432 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 13433 else if (EltVT.bitsLT(MVT::i32)) 13434 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 13435 13436 // The shift amount is identical so we can do a vector shift.
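  // At this point BaseShAmt holds the single (now i32) amount shared by all
  // lanes, so the switch below maps the generic SHL/SRA/SRL onto the
  // corresponding pslli/psrai/psrli intrinsic node (or its AVX2 form) for
  // the vector type at hand.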
13437 SDValue ValOp = N->getOperand(0); 13438 switch (N->getOpcode()) { 13439 default: 13440 llvm_unreachable("Unknown shift opcode!"); 13441 break; 13442 case ISD::SHL: 13443 if (VT == MVT::v2i64) 13444 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13445 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 13446 ValOp, BaseShAmt); 13447 if (VT == MVT::v4i32) 13448 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13449 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 13450 ValOp, BaseShAmt); 13451 if (VT == MVT::v8i16) 13452 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13453 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 13454 ValOp, BaseShAmt); 13455 if (VT == MVT::v4i64) 13456 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13457 DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32), 13458 ValOp, BaseShAmt); 13459 if (VT == MVT::v8i32) 13460 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13461 DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32), 13462 ValOp, BaseShAmt); 13463 if (VT == MVT::v16i16) 13464 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13465 DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32), 13466 ValOp, BaseShAmt); 13467 break; 13468 case ISD::SRA: 13469 if (VT == MVT::v4i32) 13470 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13471 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 13472 ValOp, BaseShAmt); 13473 if (VT == MVT::v8i16) 13474 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13475 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 13476 ValOp, BaseShAmt); 13477 if (VT == MVT::v8i32) 13478 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13479 DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32), 13480 ValOp, BaseShAmt); 13481 if (VT == MVT::v16i16) 13482 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13483 DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32), 13484 ValOp, BaseShAmt); 13485 break; 13486 case ISD::SRL: 13487 if (VT == MVT::v2i64) 13488 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13489 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 13490 ValOp, BaseShAmt); 13491 if (VT == MVT::v4i32) 13492 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13493 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 13494 ValOp, BaseShAmt); 13495 if (VT == MVT::v8i16) 13496 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13497 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 13498 ValOp, BaseShAmt); 13499 if (VT == MVT::v4i64) 13500 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13501 DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32), 13502 ValOp, BaseShAmt); 13503 if (VT == MVT::v8i32) 13504 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13505 DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32), 13506 ValOp, BaseShAmt); 13507 if (VT == MVT::v16i16) 13508 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 13509 DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32), 13510 ValOp, BaseShAmt); 13511 break; 13512 } 13513 return SDValue(); 13514} 13515 13516 13517// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 13518// where both setccs reference the same FP CMP, and rewrite for CMPEQSS 13519// and friends. Likewise for OR -> CMPNEQSS. 
13520static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 13521 TargetLowering::DAGCombinerInfo &DCI, 13522 const X86Subtarget *Subtarget) { 13523 unsigned opcode; 13524 13525 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 13526 // we're requiring SSE2 for both. 13527 if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 13528 SDValue N0 = N->getOperand(0); 13529 SDValue N1 = N->getOperand(1); 13530 SDValue CMP0 = N0->getOperand(1); 13531 SDValue CMP1 = N1->getOperand(1); 13532 DebugLoc DL = N->getDebugLoc(); 13533 13534 // The SETCCs should both refer to the same CMP. 13535 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 13536 return SDValue(); 13537 13538 SDValue CMP00 = CMP0->getOperand(0); 13539 SDValue CMP01 = CMP0->getOperand(1); 13540 EVT VT = CMP00.getValueType(); 13541 13542 if (VT == MVT::f32 || VT == MVT::f64) { 13543 bool ExpectingFlags = false; 13544 // Check for any users that want flags: 13545 for (SDNode::use_iterator UI = N->use_begin(), 13546 UE = N->use_end(); 13547 !ExpectingFlags && UI != UE; ++UI) 13548 switch (UI->getOpcode()) { 13549 default: 13550 case ISD::BR_CC: 13551 case ISD::BRCOND: 13552 case ISD::SELECT: 13553 ExpectingFlags = true; 13554 break; 13555 case ISD::CopyToReg: 13556 case ISD::SIGN_EXTEND: 13557 case ISD::ZERO_EXTEND: 13558 case ISD::ANY_EXTEND: 13559 break; 13560 } 13561 13562 if (!ExpectingFlags) { 13563 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 13564 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 13565 13566 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 13567 X86::CondCode tmp = cc0; 13568 cc0 = cc1; 13569 cc1 = tmp; 13570 } 13571 13572 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 13573 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 13574 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 13575 X86ISD::NodeType NTOperator = is64BitFP ? 13576 X86ISD::FSETCCsd : X86ISD::FSETCCss; 13577 // FIXME: need symbolic constants for these magic numbers. 13578 // See X86ATTInstPrinter.cpp:printSSECC(). 13579 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; 13580 SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, 13581 DAG.getConstant(x86cc, MVT::i8)); 13582 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, 13583 OnesOrZeroesF); 13584 SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, 13585 DAG.getConstant(1, MVT::i32)); 13586 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); 13587 return OneBitOfTruth; 13588 } 13589 } 13590 } 13591 } 13592 return SDValue(); 13593} 13594 13595/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector 13596/// so it can be folded inside ANDNP. 13597static bool CanFoldXORWithAllOnes(const SDNode *N) { 13598 EVT VT = N->getValueType(0); 13599 13600 // Match direct AllOnes for 128 and 256-bit vectors 13601 if (ISD::isBuildVectorAllOnes(N)) 13602 return true; 13603 13604 // Look through a bit convert. 
13605 if (N->getOpcode() == ISD::BITCAST) 13606 N = N->getOperand(0).getNode(); 13607 13608 // Sometimes the operand may come from a insert_subvector building a 256-bit 13609 // allones vector 13610 if (VT.getSizeInBits() == 256 && 13611 N->getOpcode() == ISD::INSERT_SUBVECTOR) { 13612 SDValue V1 = N->getOperand(0); 13613 SDValue V2 = N->getOperand(1); 13614 13615 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && 13616 V1.getOperand(0).getOpcode() == ISD::UNDEF && 13617 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && 13618 ISD::isBuildVectorAllOnes(V2.getNode())) 13619 return true; 13620 } 13621 13622 return false; 13623} 13624 13625static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 13626 TargetLowering::DAGCombinerInfo &DCI, 13627 const X86Subtarget *Subtarget) { 13628 if (DCI.isBeforeLegalizeOps()) 13629 return SDValue(); 13630 13631 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 13632 if (R.getNode()) 13633 return R; 13634 13635 EVT VT = N->getValueType(0); 13636 13637 // Create ANDN, BLSI, and BLSR instructions 13638 // BLSI is X & (-X) 13639 // BLSR is X & (X-1) 13640 if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { 13641 SDValue N0 = N->getOperand(0); 13642 SDValue N1 = N->getOperand(1); 13643 DebugLoc DL = N->getDebugLoc(); 13644 13645 // Check LHS for not 13646 if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1))) 13647 return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1); 13648 // Check RHS for not 13649 if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1))) 13650 return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0); 13651 13652 // Check LHS for neg 13653 if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && 13654 isZero(N0.getOperand(0))) 13655 return DAG.getNode(X86ISD::BLSI, DL, VT, N1); 13656 13657 // Check RHS for neg 13658 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && 13659 isZero(N1.getOperand(0))) 13660 return DAG.getNode(X86ISD::BLSI, DL, VT, N0); 13661 13662 // Check LHS for X-1 13663 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 13664 isAllOnes(N0.getOperand(1))) 13665 return DAG.getNode(X86ISD::BLSR, DL, VT, N1); 13666 13667 // Check RHS for X-1 13668 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 13669 isAllOnes(N1.getOperand(1))) 13670 return DAG.getNode(X86ISD::BLSR, DL, VT, N0); 13671 13672 return SDValue(); 13673 } 13674 13675 // Want to form ANDNP nodes: 13676 // 1) In the hopes of then easily combining them with OR and AND nodes 13677 // to form PBLEND/PSIGN. 
13678 // 2) To match ANDN packed intrinsics 13679 if (VT != MVT::v2i64 && VT != MVT::v4i64) 13680 return SDValue(); 13681 13682 SDValue N0 = N->getOperand(0); 13683 SDValue N1 = N->getOperand(1); 13684 DebugLoc DL = N->getDebugLoc(); 13685 13686 // Check LHS for vnot 13687 if (N0.getOpcode() == ISD::XOR && 13688 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 13689 CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) 13690 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); 13691 13692 // Check RHS for vnot 13693 if (N1.getOpcode() == ISD::XOR && 13694 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 13695 CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) 13696 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); 13697 13698 return SDValue(); 13699} 13700 13701static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 13702 TargetLowering::DAGCombinerInfo &DCI, 13703 const X86Subtarget *Subtarget) { 13704 if (DCI.isBeforeLegalizeOps()) 13705 return SDValue(); 13706 13707 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 13708 if (R.getNode()) 13709 return R; 13710 13711 EVT VT = N->getValueType(0); 13712 13713 SDValue N0 = N->getOperand(0); 13714 SDValue N1 = N->getOperand(1); 13715 13716 // look for psign/blend 13717 if (VT == MVT::v2i64 || VT == MVT::v4i64) { 13718 if (!Subtarget->hasSSSE3orAVX() || 13719 (VT == MVT::v4i64 && !Subtarget->hasAVX2())) 13720 return SDValue(); 13721 13722 // Canonicalize pandn to RHS 13723 if (N0.getOpcode() == X86ISD::ANDNP) 13724 std::swap(N0, N1); 13725 // or (and (m, x), (pandn m, y)) 13726 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { 13727 SDValue Mask = N1.getOperand(0); 13728 SDValue X = N1.getOperand(1); 13729 SDValue Y; 13730 if (N0.getOperand(0) == Mask) 13731 Y = N0.getOperand(1); 13732 if (N0.getOperand(1) == Mask) 13733 Y = N0.getOperand(0); 13734 13735 // Check to see if the mask appeared in both the AND and ANDNP and 13736 if (!Y.getNode()) 13737 return SDValue(); 13738 13739 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 13740 if (Mask.getOpcode() != ISD::BITCAST || 13741 X.getOpcode() != ISD::BITCAST || 13742 Y.getOpcode() != ISD::BITCAST) 13743 return SDValue(); 13744 13745 // Look through mask bitcast. 13746 Mask = Mask.getOperand(0); 13747 EVT MaskVT = Mask.getValueType(); 13748 13749 // Validate that the Mask operand is a vector sra node. The sra node 13750 // will be an intrinsic. 13751 if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) 13752 return SDValue(); 13753 13754 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 13755 // there is no psrai.b 13756 switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { 13757 case Intrinsic::x86_sse2_psrai_w: 13758 case Intrinsic::x86_sse2_psrai_d: 13759 case Intrinsic::x86_avx2_psrai_w: 13760 case Intrinsic::x86_avx2_psrai_d: 13761 break; 13762 default: return SDValue(); 13763 } 13764 13765 // Check that the SRA is all signbits. 13766 SDValue SraC = Mask.getOperand(2); 13767 unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 13768 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 13769 if ((SraAmt + 1) != EltBits) 13770 return SDValue(); 13771 13772 DebugLoc DL = N->getDebugLoc(); 13773 13774 // Now we know we at least have a plendvb with the mask val. See if 13775 // we can form a psignb/w/d. 
13776 // psign = x.type == y.type == mask.type && y = sub(0, x); 13777 X = X.getOperand(0); 13778 Y = Y.getOperand(0); 13779 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 13780 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 13781 X.getValueType() == MaskVT && X.getValueType() == Y.getValueType() && 13782 (EltBits == 8 || EltBits == 16 || EltBits == 32)) { 13783 SDValue Sign = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, 13784 Mask.getOperand(1)); 13785 return DAG.getNode(ISD::BITCAST, DL, VT, Sign); 13786 } 13787 // PBLENDVB only available on SSE 4.1 13788 if (!Subtarget->hasSSE41orAVX()) 13789 return SDValue(); 13790 13791 EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; 13792 13793 X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); 13794 Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); 13795 Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); 13796 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); 13797 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 13798 } 13799 } 13800 13801 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 13802 return SDValue(); 13803 13804 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 13805 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 13806 std::swap(N0, N1); 13807 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 13808 return SDValue(); 13809 if (!N0.hasOneUse() || !N1.hasOneUse()) 13810 return SDValue(); 13811 13812 SDValue ShAmt0 = N0.getOperand(1); 13813 if (ShAmt0.getValueType() != MVT::i8) 13814 return SDValue(); 13815 SDValue ShAmt1 = N1.getOperand(1); 13816 if (ShAmt1.getValueType() != MVT::i8) 13817 return SDValue(); 13818 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 13819 ShAmt0 = ShAmt0.getOperand(0); 13820 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 13821 ShAmt1 = ShAmt1.getOperand(0); 13822 13823 DebugLoc DL = N->getDebugLoc(); 13824 unsigned Opc = X86ISD::SHLD; 13825 SDValue Op0 = N0.getOperand(0); 13826 SDValue Op1 = N1.getOperand(0); 13827 if (ShAmt0.getOpcode() == ISD::SUB) { 13828 Opc = X86ISD::SHRD; 13829 std::swap(Op0, Op1); 13830 std::swap(ShAmt0, ShAmt1); 13831 } 13832 13833 unsigned Bits = VT.getSizeInBits(); 13834 if (ShAmt1.getOpcode() == ISD::SUB) { 13835 SDValue Sum = ShAmt1.getOperand(0); 13836 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 13837 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 13838 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 13839 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 13840 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 13841 return DAG.getNode(Opc, DL, VT, 13842 Op0, Op1, 13843 DAG.getNode(ISD::TRUNCATE, DL, 13844 MVT::i8, ShAmt0)); 13845 } 13846 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 13847 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 13848 if (ShAmt0C && 13849 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 13850 return DAG.getNode(Opc, DL, VT, 13851 N0.getOperand(0), N1.getOperand(0), 13852 DAG.getNode(ISD::TRUNCATE, DL, 13853 MVT::i8, ShAmt0)); 13854 } 13855 13856 return SDValue(); 13857} 13858 13859static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, 13860 TargetLowering::DAGCombinerInfo &DCI, 13861 const X86Subtarget *Subtarget) { 13862 if (DCI.isBeforeLegalizeOps()) 13863 return SDValue(); 13864 13865 EVT VT = N->getValueType(0); 13866 13867 if (VT != MVT::i32 && VT != MVT::i64) 13868 return SDValue(); 13869 13870 // Create BLSMSK instructions by finding X ^ (X-1) 13871 SDValue N0 = N->getOperand(0); 13872 SDValue N1 = N->getOperand(1); 13873 
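// In the DAG the "X-1" appears as (add X, -1), so the two checks below match
// (xor X, (add X, -1)) and its commuted form. BLSMSK yields a mask of all
// bits up to and including the lowest set bit of X.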
DebugLoc DL = N->getDebugLoc(); 13874 13875 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && 13876 isAllOnes(N0.getOperand(1))) 13877 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); 13878 13879 if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && 13880 isAllOnes(N1.getOperand(1))) 13881 return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); 13882 13883 return SDValue(); 13884} 13885 13886/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. 13887static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, 13888 const X86Subtarget *Subtarget) { 13889 LoadSDNode *Ld = cast<LoadSDNode>(N); 13890 EVT RegVT = Ld->getValueType(0); 13891 EVT MemVT = Ld->getMemoryVT(); 13892 DebugLoc dl = Ld->getDebugLoc(); 13893 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13894 13895 ISD::LoadExtType Ext = Ld->getExtensionType(); 13896 13897 // If this is a vector EXT Load then attempt to optimize it using a 13898 // shuffle. We need SSE4 for the shuffles. 13899 // TODO: It is possible to support ZExt by zeroing the undef values 13900 // during the shuffle phase or after the shuffle. 13901 if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { 13902 assert(MemVT != RegVT && "Cannot extend to the same type"); 13903 assert(MemVT.isVector() && "Must load a vector from memory"); 13904 13905 unsigned NumElems = RegVT.getVectorNumElements(); 13906 unsigned RegSz = RegVT.getSizeInBits(); 13907 unsigned MemSz = MemVT.getSizeInBits(); 13908 assert(RegSz > MemSz && "Register size must be greater than the mem size"); 13909 // All sizes must be a power of two 13910 if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue(); 13911 13912 // Attempt to load the original value using a single load op. 13913 // Find a scalar type which is equal to the loaded word size. 13914 MVT SclrLoadTy = MVT::i8; 13915 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 13916 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 13917 MVT Tp = (MVT::SimpleValueType)tp; 13918 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) { 13919 SclrLoadTy = Tp; 13920 break; 13921 } 13922 } 13923 13924 // Proceed if a load word is found. 13925 if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue(); 13926 13927 EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, 13928 RegSz/SclrLoadTy.getSizeInBits()); 13929 13930 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 13931 RegSz/MemVT.getScalarType().getSizeInBits()); 13932 // Can't shuffle using an illegal type. 13933 if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); 13934 13935 // Perform a single load. 13936 SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), 13937 Ld->getBasePtr(), 13938 Ld->getPointerInfo(), Ld->isVolatile(), 13939 Ld->isNonTemporal(), Ld->isInvariant(), 13940 Ld->getAlignment()); 13941 13942 // Insert the word loaded into a vector. 13943 SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 13944 LoadUnitVecVT, ScalarLoad); 13945 13946 // Bitcast the loaded value to a vector of the original element type, in 13947 // the size of the target vector type. 13948 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector); 13949 unsigned SizeRatio = RegSz/MemSz; 13950 13951 // Redistribute the loaded elements into the different locations. 
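// For example, an extending load of v4i8 to v4i32 loads a single i32, views
// it as v16i8 and shuffles with mask <0,-1,-1,-1,1,-1,-1,-1,2,-1,-1,-1,3,-1,-1,-1>
// so that loaded byte i lands in the low byte of 32-bit lane i; the remaining
// bytes stay undef, which is fine for an EXTLOAD.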
13952 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 13953 for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i; 13954 13955 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 13956 DAG.getUNDEF(SlicedVec.getValueType()), 13957 ShuffleVec.data()); 13958 13959 // Bitcast to the requested type. 13960 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 13961 // Replace the original load with the new sequence 13962 // and return the new chain. 13963 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff); 13964 return SDValue(ScalarLoad.getNode(), 1); 13965 } 13966 13967 return SDValue(); 13968} 13969 13970/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 13971static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 13972 const X86Subtarget *Subtarget) { 13973 StoreSDNode *St = cast<StoreSDNode>(N); 13974 EVT VT = St->getValue().getValueType(); 13975 EVT StVT = St->getMemoryVT(); 13976 DebugLoc dl = St->getDebugLoc(); 13977 SDValue StoredVal = St->getOperand(1); 13978 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13979 13980 // If we are saving a concatenation of two XMM registers, perform two stores. 13981 // This is better in Sandy Bridge cause one 256-bit mem op is done via two 13982 // 128-bit ones. If in the future the cost becomes only one memory access the 13983 // first version would be better. 13984 if (VT.getSizeInBits() == 256 && 13985 StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && 13986 StoredVal.getNumOperands() == 2) { 13987 13988 SDValue Value0 = StoredVal.getOperand(0); 13989 SDValue Value1 = StoredVal.getOperand(1); 13990 13991 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); 13992 SDValue Ptr0 = St->getBasePtr(); 13993 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); 13994 13995 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 13996 St->getPointerInfo(), St->isVolatile(), 13997 St->isNonTemporal(), St->getAlignment()); 13998 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 13999 St->getPointerInfo(), St->isVolatile(), 14000 St->isNonTemporal(), St->getAlignment()); 14001 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 14002 } 14003 14004 // Optimize trunc store (of multiple scalars) to shuffle and store. 14005 // First, pack all of the elements in one place. Next, store to memory 14006 // in fewer chunks. 14007 if (St->isTruncatingStore() && VT.isVector()) { 14008 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14009 unsigned NumElems = VT.getVectorNumElements(); 14010 assert(StVT != VT && "Cannot truncate to the same type"); 14011 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 14012 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 14013 14014 // From, To sizes and ElemCount must be pow of two 14015 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 14016 // We are going to use the original vector elt for storing. 14017 // Accumulated smaller vector elements must be a multiple of the store size. 
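// For example, a truncating store of v8i32 to v8i8 has FromSz = 32, ToSz = 8
// and SizeRatio = 4: the value is viewed as v32i8 and the shuffle gathers
// bytes 0,4,8,...,28 (the low byte of each i32) into the bottom 8 bytes,
// which are then written out with one or more wide integer stores.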
14018 if (0 != (NumElems * FromSz) % ToSz) return SDValue(); 14019 14020 unsigned SizeRatio = FromSz / ToSz; 14021 14022 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 14023 14024 // Create a type on which we perform the shuffle 14025 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 14026 StVT.getScalarType(), NumElems*SizeRatio); 14027 14028 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 14029 14030 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); 14031 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 14032 for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio; 14033 14034 // Can't shuffle using an illegal type 14035 if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); 14036 14037 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 14038 DAG.getUNDEF(WideVec.getValueType()), 14039 ShuffleVec.data()); 14040 // At this point all of the data is stored at the bottom of the 14041 // register. We now need to save it to mem. 14042 14043 // Find the largest store unit 14044 MVT StoreType = MVT::i8; 14045 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 14046 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 14047 MVT Tp = (MVT::SimpleValueType)tp; 14048 if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz) 14049 StoreType = Tp; 14050 } 14051 14052 // Bitcast the original vector into a vector of store-size units 14053 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 14054 StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); 14055 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 14056 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); 14057 SmallVector<SDValue, 8> Chains; 14058 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, 14059 TLI.getPointerTy()); 14060 SDValue Ptr = St->getBasePtr(); 14061 14062 // Perform one or more big stores into memory. 14063 for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) { 14064 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 14065 StoreType, ShuffWide, 14066 DAG.getIntPtrConstant(i)); 14067 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 14068 St->getPointerInfo(), St->isVolatile(), 14069 St->isNonTemporal(), St->getAlignment()); 14070 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 14071 Chains.push_back(Ch); 14072 } 14073 14074 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], 14075 Chains.size()); 14076 } 14077 14078 14079 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 14080 // the FP state in cases where an emms may be missing. 14081 // A preferable solution to the general problem is to figure out the right 14082 // places to insert EMMS. This qualifies as a quick hack. 14083 14084 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 
14085 if (VT.getSizeInBits() != 64) 14086 return SDValue(); 14087 14088 const Function *F = DAG.getMachineFunction().getFunction(); 14089 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 14090 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps 14091 && Subtarget->hasXMMInt(); 14092 if ((VT.isVector() || 14093 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 14094 isa<LoadSDNode>(St->getValue()) && 14095 !cast<LoadSDNode>(St->getValue())->isVolatile() && 14096 St->getChain().hasOneUse() && !St->isVolatile()) { 14097 SDNode* LdVal = St->getValue().getNode(); 14098 LoadSDNode *Ld = 0; 14099 int TokenFactorIndex = -1; 14100 SmallVector<SDValue, 8> Ops; 14101 SDNode* ChainVal = St->getChain().getNode(); 14102 // Must be a store of a load. We currently handle two cases: the load 14103 // is a direct child, and it's under an intervening TokenFactor. It is 14104 // possible to dig deeper under nested TokenFactors. 14105 if (ChainVal == LdVal) 14106 Ld = cast<LoadSDNode>(St->getChain()); 14107 else if (St->getValue().hasOneUse() && 14108 ChainVal->getOpcode() == ISD::TokenFactor) { 14109 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 14110 if (ChainVal->getOperand(i).getNode() == LdVal) { 14111 TokenFactorIndex = i; 14112 Ld = cast<LoadSDNode>(St->getValue()); 14113 } else 14114 Ops.push_back(ChainVal->getOperand(i)); 14115 } 14116 } 14117 14118 if (!Ld || !ISD::isNormalLoad(Ld)) 14119 return SDValue(); 14120 14121 // If this is not the MMX case, i.e. we are just turning i64 load/store 14122 // into f64 load/store, avoid the transformation if there are multiple 14123 // uses of the loaded value. 14124 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 14125 return SDValue(); 14126 14127 DebugLoc LdDL = Ld->getDebugLoc(); 14128 DebugLoc StDL = N->getDebugLoc(); 14129 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 14130 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 14131 // pair instead. 14132 if (Subtarget->is64Bit() || F64IsLegal) { 14133 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 14134 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 14135 Ld->getPointerInfo(), Ld->isVolatile(), 14136 Ld->isNonTemporal(), Ld->isInvariant(), 14137 Ld->getAlignment()); 14138 SDValue NewChain = NewLd.getValue(1); 14139 if (TokenFactorIndex != -1) { 14140 Ops.push_back(NewChain); 14141 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 14142 Ops.size()); 14143 } 14144 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 14145 St->getPointerInfo(), 14146 St->isVolatile(), St->isNonTemporal(), 14147 St->getAlignment()); 14148 } 14149 14150 // Otherwise, lower to two pairs of 32-bit loads / stores. 
14151 SDValue LoAddr = Ld->getBasePtr(); 14152 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 14153 DAG.getConstant(4, MVT::i32)); 14154 14155 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 14156 Ld->getPointerInfo(), 14157 Ld->isVolatile(), Ld->isNonTemporal(), 14158 Ld->isInvariant(), Ld->getAlignment()); 14159 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 14160 Ld->getPointerInfo().getWithOffset(4), 14161 Ld->isVolatile(), Ld->isNonTemporal(), 14162 Ld->isInvariant(), 14163 MinAlign(Ld->getAlignment(), 4)); 14164 14165 SDValue NewChain = LoLd.getValue(1); 14166 if (TokenFactorIndex != -1) { 14167 Ops.push_back(LoLd); 14168 Ops.push_back(HiLd); 14169 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 14170 Ops.size()); 14171 } 14172 14173 LoAddr = St->getBasePtr(); 14174 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 14175 DAG.getConstant(4, MVT::i32)); 14176 14177 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 14178 St->getPointerInfo(), 14179 St->isVolatile(), St->isNonTemporal(), 14180 St->getAlignment()); 14181 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 14182 St->getPointerInfo().getWithOffset(4), 14183 St->isVolatile(), 14184 St->isNonTemporal(), 14185 MinAlign(St->getAlignment(), 4)); 14186 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 14187 } 14188 return SDValue(); 14189} 14190 14191/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" 14192/// and return the operands for the horizontal operation in LHS and RHS. A 14193/// horizontal operation performs the binary operation on successive elements 14194/// of its first operand, then on successive elements of its second operand, 14195/// returning the resulting values in a vector. For example, if 14196/// A = < float a0, float a1, float a2, float a3 > 14197/// and 14198/// B = < float b0, float b1, float b2, float b3 > 14199/// then the result of doing a horizontal operation on A and B is 14200/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 14201/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form 14202/// A horizontal-op B, for some already available A and B, and if so then LHS is 14203/// set to A, RHS to B, and the routine returns 'true'. 14204/// Note that the binary operation should have the property that if one of the 14205/// operands is UNDEF then the result is UNDEF. 14206static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { 14207 // Look for the following pattern: if 14208 // A = < float a0, float a1, float a2, float a3 > 14209 // B = < float b0, float b1, float b2, float b3 > 14210 // and 14211 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> 14212 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> 14213 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > 14214 // which is A horizontal-op B. 14215 14216 // At least one of the operands should be a vector shuffle. 14217 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && 14218 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) 14219 return false; 14220 14221 EVT VT = LHS.getValueType(); 14222 14223 assert((VT.is128BitVector() || VT.is256BitVector()) && 14224 "Unsupported vector type for horizontal add/sub"); 14225 14226 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to 14227 // operate independently on 128-bit lanes. 
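// For v8f32, for example, the expected result is
//   < a0 op a1, a2 op a3, b0 op b1, b2 op b3,
//     a4 op a5, a6 op a7, b4 op b5, b6 op b7 >
// so the index checks below are performed one 128-bit lane at a time.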
14228 unsigned NumElts = VT.getVectorNumElements(); 14229 unsigned NumLanes = VT.getSizeInBits()/128; 14230 unsigned NumLaneElts = NumElts / NumLanes; 14231 assert((NumLaneElts % 2 == 0) && 14232 "Vector type should have an even number of elements in each lane"); 14233 unsigned HalfLaneElts = NumLaneElts/2; 14234 14235 // View LHS in the form 14236 // LHS = VECTOR_SHUFFLE A, B, LMask 14237 // If LHS is not a shuffle then pretend it is the shuffle 14238 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> 14239 // NOTE: in what follows a default initialized SDValue represents an UNDEF of 14240 // type VT. 14241 SDValue A, B; 14242 SmallVector<int, 16> LMask(NumElts); 14243 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 14244 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) 14245 A = LHS.getOperand(0); 14246 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) 14247 B = LHS.getOperand(1); 14248 cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask); 14249 } else { 14250 if (LHS.getOpcode() != ISD::UNDEF) 14251 A = LHS; 14252 for (unsigned i = 0; i != NumElts; ++i) 14253 LMask[i] = i; 14254 } 14255 14256 // Likewise, view RHS in the form 14257 // RHS = VECTOR_SHUFFLE C, D, RMask 14258 SDValue C, D; 14259 SmallVector<int, 16> RMask(NumElts); 14260 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 14261 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) 14262 C = RHS.getOperand(0); 14263 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) 14264 D = RHS.getOperand(1); 14265 cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask); 14266 } else { 14267 if (RHS.getOpcode() != ISD::UNDEF) 14268 C = RHS; 14269 for (unsigned i = 0; i != NumElts; ++i) 14270 RMask[i] = i; 14271 } 14272 14273 // Check that the shuffles are both shuffling the same vectors. 14274 if (!(A == C && B == D) && !(A == D && B == C)) 14275 return false; 14276 14277 // If everything is UNDEF then bail out: it would be better to fold to UNDEF. 14278 if (!A.getNode() && !B.getNode()) 14279 return false; 14280 14281 // If A and B occur in reverse order in RHS, then "swap" them (which means 14282 // rewriting the mask). 14283 if (A != C) 14284 CommuteVectorShuffleMask(RMask, NumElts); 14285 14286 // At this point LHS and RHS are equivalent to 14287 // LHS = VECTOR_SHUFFLE A, B, LMask 14288 // RHS = VECTOR_SHUFFLE A, B, RMask 14289 // Check that the masks correspond to performing a horizontal operation. 14290 for (unsigned i = 0; i != NumElts; ++i) { 14291 int LIdx = LMask[i], RIdx = RMask[i]; 14292 14293 // Ignore any UNDEF components. 14294 if (LIdx < 0 || RIdx < 0 || 14295 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || 14296 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) 14297 continue; 14298 14299 // Check that successive elements are being operated on. If not, this is 14300 // not a horizontal operation. 14301 unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs 14302 unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; 14303 int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; 14304 if (!(LIdx == Index && RIdx == Index + 1) && 14305 !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) 14306 return false; 14307 } 14308 14309 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 14310 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 14311 return true; 14312} 14313 14314/// PerformFADDCombine - Do target-specific dag combines on floating point adds. 
14315static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, 14316 const X86Subtarget *Subtarget) { 14317 EVT VT = N->getValueType(0); 14318 SDValue LHS = N->getOperand(0); 14319 SDValue RHS = N->getOperand(1); 14320 14321 // Try to synthesize horizontal adds from adds of shuffles. 14322 if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 14323 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 14324 isHorizontalBinOp(LHS, RHS, true)) 14325 return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); 14326 return SDValue(); 14327} 14328 14329/// PerformFSUBCombine - Do target-specific dag combines on floating point subs. 14330static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, 14331 const X86Subtarget *Subtarget) { 14332 EVT VT = N->getValueType(0); 14333 SDValue LHS = N->getOperand(0); 14334 SDValue RHS = N->getOperand(1); 14335 14336 // Try to synthesize horizontal subs from subs of shuffles. 14337 if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 14338 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 14339 isHorizontalBinOp(LHS, RHS, false)) 14340 return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); 14341 return SDValue(); 14342} 14343 14344/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 14345/// X86ISD::FXOR nodes. 14346static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 14347 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 14348 // F[X]OR(0.0, x) -> x 14349 // F[X]OR(x, 0.0) -> x 14350 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 14351 if (C->getValueAPF().isPosZero()) 14352 return N->getOperand(1); 14353 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 14354 if (C->getValueAPF().isPosZero()) 14355 return N->getOperand(0); 14356 return SDValue(); 14357} 14358 14359/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 14360static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 14361 // FAND(0.0, x) -> 0.0 14362 // FAND(x, 0.0) -> 0.0 14363 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 14364 if (C->getValueAPF().isPosZero()) 14365 return N->getOperand(0); 14366 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 14367 if (C->getValueAPF().isPosZero()) 14368 return N->getOperand(1); 14369 return SDValue(); 14370} 14371 14372static SDValue PerformBTCombine(SDNode *N, 14373 SelectionDAG &DAG, 14374 TargetLowering::DAGCombinerInfo &DCI) { 14375 // BT ignores high bits in the bit index operand. 
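// Only the low log2(BitWidth) bits of the index matter, so e.g. an explicit
// (and idx, 31) mask on a 32-bit index can be removed by the demanded-bits
// simplification below.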
14376 SDValue Op1 = N->getOperand(1); 14377 if (Op1.hasOneUse()) { 14378 unsigned BitWidth = Op1.getValueSizeInBits(); 14379 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 14380 APInt KnownZero, KnownOne; 14381 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 14382 !DCI.isBeforeLegalizeOps()); 14383 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14384 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 14385 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 14386 DCI.CommitTargetLoweringOpt(TLO); 14387 } 14388 return SDValue(); 14389} 14390 14391static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 14392 SDValue Op = N->getOperand(0); 14393 if (Op.getOpcode() == ISD::BITCAST) 14394 Op = Op.getOperand(0); 14395 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 14396 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 14397 VT.getVectorElementType().getSizeInBits() == 14398 OpVT.getVectorElementType().getSizeInBits()) { 14399 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 14400 } 14401 return SDValue(); 14402} 14403 14404static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 14405 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 14406 // (and (i32 x86isd::setcc_carry), 1) 14407 // This eliminates the zext. This transformation is necessary because 14408 // ISD::SETCC is always legalized to i8. 14409 DebugLoc dl = N->getDebugLoc(); 14410 SDValue N0 = N->getOperand(0); 14411 EVT VT = N->getValueType(0); 14412 if (N0.getOpcode() == ISD::AND && 14413 N0.hasOneUse() && 14414 N0.getOperand(0).hasOneUse()) { 14415 SDValue N00 = N0.getOperand(0); 14416 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 14417 return SDValue(); 14418 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 14419 if (!C || C->getZExtValue() != 1) 14420 return SDValue(); 14421 return DAG.getNode(ISD::AND, dl, VT, 14422 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 14423 N00.getOperand(0), N00.getOperand(1)), 14424 DAG.getConstant(1, VT)); 14425 } 14426 14427 return SDValue(); 14428} 14429 14430// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 14431static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 14432 unsigned X86CC = N->getConstantOperandVal(0); 14433 SDValue EFLAG = N->getOperand(1); 14434 DebugLoc DL = N->getDebugLoc(); 14435 14436 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 14437 // a zext and produces an all-ones bit which is more useful than 0/1 in some 14438 // cases. 14439 if (X86CC == X86::COND_B) 14440 return DAG.getNode(ISD::AND, DL, MVT::i8, 14441 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 14442 DAG.getConstant(X86CC, MVT::i8), EFLAG), 14443 DAG.getConstant(1, MVT::i8)); 14444 14445 return SDValue(); 14446} 14447 14448static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 14449 const X86TargetLowering *XTLI) { 14450 SDValue Op0 = N->getOperand(0); 14451 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 14452 // a 32-bit target where SSE doesn't support i64->FP operations. 
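// The combine below only fires for a plain (non-volatile, non-extending) load
// of an illegal integer type (i64 on a 32-bit target) with a single use;
// BuildFILD then emits the x87 FILD-based sequence and the load's chain users
// are rewired to the new chain.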
14453 if (Op0.getOpcode() == ISD::LOAD) { 14454 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 14455 EVT VT = Ld->getValueType(0); 14456 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 14457 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 14458 !XTLI->getSubtarget()->is64Bit() && 14459 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 14460 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 14461 Ld->getChain(), Op0, DAG); 14462 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 14463 return FILDChain; 14464 } 14465 } 14466 return SDValue(); 14467} 14468 14469// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 14470static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 14471 X86TargetLowering::DAGCombinerInfo &DCI) { 14472 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 14473 // the result is either zero or one (depending on the input carry bit). 14474 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 14475 if (X86::isZeroNode(N->getOperand(0)) && 14476 X86::isZeroNode(N->getOperand(1)) && 14477 // We don't have a good way to replace an EFLAGS use, so only do this when 14478 // dead right now. 14479 SDValue(N, 1).use_empty()) { 14480 DebugLoc DL = N->getDebugLoc(); 14481 EVT VT = N->getValueType(0); 14482 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 14483 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 14484 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 14485 DAG.getConstant(X86::COND_B,MVT::i8), 14486 N->getOperand(2)), 14487 DAG.getConstant(1, VT)); 14488 return DCI.CombineTo(N, Res1, CarryOut); 14489 } 14490 14491 return SDValue(); 14492} 14493 14494// fold (add Y, (sete X, 0)) -> adc 0, Y 14495// (add Y, (setne X, 0)) -> sbb -1, Y 14496// (sub (sete X, 0), Y) -> sbb 0, Y 14497// (sub (setne X, 0), Y) -> adc -1, Y 14498static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 14499 DebugLoc DL = N->getDebugLoc(); 14500 14501 // Look through ZExts. 14502 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 14503 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 14504 return SDValue(); 14505 14506 SDValue SetCC = Ext.getOperand(0); 14507 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 14508 return SDValue(); 14509 14510 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 14511 if (CC != X86::COND_E && CC != X86::COND_NE) 14512 return SDValue(); 14513 14514 SDValue Cmp = SetCC.getOperand(1); 14515 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 14516 !X86::isZeroNode(Cmp.getOperand(1)) || 14517 !Cmp.getOperand(0).getValueType().isInteger()) 14518 return SDValue(); 14519 14520 SDValue CmpOp0 = Cmp.getOperand(0); 14521 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 14522 DAG.getConstant(1, CmpOp0.getValueType())); 14523 14524 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 14525 if (CC == X86::COND_NE) 14526 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 14527 DL, OtherVal.getValueType(), OtherVal, 14528 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 14529 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, 14530 DL, OtherVal.getValueType(), OtherVal, 14531 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 14532} 14533 14534/// PerformADDCombine - Do target-specific dag combines on integer adds. 
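// For example, with SSSE3 available and
//   Op0 = shuffle <8 x i16> A, B, <0,2,4,6,8,10,12,14>
//   Op1 = shuffle <8 x i16> A, B, <1,3,5,7,9,11,13,15>
// isHorizontalBinOp recognizes Op0 + Op1 as "A horizontal-add B" and the add
// is emitted as a single X86ISD::HADD (phaddw).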
14535static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
14536 const X86Subtarget *Subtarget) {
14537 EVT VT = N->getValueType(0);
14538 SDValue Op0 = N->getOperand(0);
14539 SDValue Op1 = N->getOperand(1);
14540
14541 // Try to synthesize horizontal adds from adds of shuffles.
14542 if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
14543 (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
14544 isHorizontalBinOp(Op0, Op1, true))
14545 return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
14546
14547 return OptimizeConditionalInDecrement(N, DAG);
14548}
14549
14550static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
14551 const X86Subtarget *Subtarget) {
14552 SDValue Op0 = N->getOperand(0);
14553 SDValue Op1 = N->getOperand(1);
14554
14555 // X86 can't encode an immediate LHS of a sub. See if we can push the
14556 // negation into a preceding instruction.
14557 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
14558 // If the RHS of the sub is a XOR with one use and a constant, invert the
14559 // immediate. Then add one to the LHS of the sub so we can turn
14560 // X-Y -> X+~Y+1, saving one register.
14561 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
14562 isa<ConstantSDNode>(Op1.getOperand(1))) {
14563 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
14564 EVT VT = Op0.getValueType();
14565 SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
14566 Op1.getOperand(0),
14567 DAG.getConstant(~XorC, VT));
14568 return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
14569 DAG.getConstant(C->getAPIntValue()+1, VT));
14570 }
14571 }
14572
14573 // Try to synthesize horizontal subs from subs of shuffles.
14574 EVT VT = N->getValueType(0);
14575 if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
14576 (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
14577 isHorizontalBinOp(Op0, Op1, true))
14578 return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
14579
14580 return OptimizeConditionalInDecrement(N, DAG);
14581}
14582
14583SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
14584 DAGCombinerInfo &DCI) const {
14585 SelectionDAG &DAG = DCI.DAG;
14586 switch (N->getOpcode()) {
14587 default: break;
14588 case ISD::EXTRACT_VECTOR_ELT:
14589 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
14590 case ISD::VSELECT:
14591 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
14592 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
14593 case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
14594 case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
14595 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
14596 case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
14597 case ISD::SHL:
14598 case ISD::SRA:
14599 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
14600 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
14601 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
14602 case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
14603 case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
14604 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
14605 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
14606 case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
14607 case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
14608 case X86ISD::FXOR:
14609 case X86ISD::FOR:
return PerformFORCombine(N, DAG); 14610 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 14611 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 14612 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 14613 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 14614 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); 14615 case X86ISD::SHUFPS: // Handle all target specific shuffles 14616 case X86ISD::SHUFPD: 14617 case X86ISD::PALIGN: 14618 case X86ISD::UNPCKH: 14619 case X86ISD::UNPCKL: 14620 case X86ISD::MOVHLPS: 14621 case X86ISD::MOVLHPS: 14622 case X86ISD::PSHUFD: 14623 case X86ISD::PSHUFHW: 14624 case X86ISD::PSHUFLW: 14625 case X86ISD::MOVSS: 14626 case X86ISD::MOVSD: 14627 case X86ISD::VPERMILP: 14628 case X86ISD::VPERM2X128: 14629 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); 14630 } 14631 14632 return SDValue(); 14633} 14634 14635/// isTypeDesirableForOp - Return true if the target has native support for 14636/// the specified value type and it is 'desirable' to use the type for the 14637/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 14638/// instruction encodings are longer and some i16 instructions are slow. 14639bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 14640 if (!isTypeLegal(VT)) 14641 return false; 14642 if (VT != MVT::i16) 14643 return true; 14644 14645 switch (Opc) { 14646 default: 14647 return true; 14648 case ISD::LOAD: 14649 case ISD::SIGN_EXTEND: 14650 case ISD::ZERO_EXTEND: 14651 case ISD::ANY_EXTEND: 14652 case ISD::SHL: 14653 case ISD::SRL: 14654 case ISD::SUB: 14655 case ISD::ADD: 14656 case ISD::MUL: 14657 case ISD::AND: 14658 case ISD::OR: 14659 case ISD::XOR: 14660 return false; 14661 } 14662} 14663 14664/// IsDesirableToPromoteOp - This method query the target whether it is 14665/// beneficial for dag combiner to promote the specified node. If true, it 14666/// should return the desired promotion type by reference. 14667bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 14668 EVT VT = Op.getValueType(); 14669 if (VT != MVT::i16) 14670 return false; 14671 14672 bool Promote = false; 14673 bool Commute = false; 14674 switch (Op.getOpcode()) { 14675 default: break; 14676 case ISD::LOAD: { 14677 LoadSDNode *LD = cast<LoadSDNode>(Op); 14678 // If the non-extending load has a single use and it's not live out, then it 14679 // might be folded. 14680 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 14681 Op.hasOneUse()*/) { 14682 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 14683 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 14684 // The only case where we'd want to promote LOAD (rather then it being 14685 // promoted as an operand is when it's only use is liveout. 14686 if (UI->getOpcode() != ISD::CopyToReg) 14687 return false; 14688 } 14689 } 14690 Promote = true; 14691 break; 14692 } 14693 case ISD::SIGN_EXTEND: 14694 case ISD::ZERO_EXTEND: 14695 case ISD::ANY_EXTEND: 14696 Promote = true; 14697 break; 14698 case ISD::SHL: 14699 case ISD::SRL: { 14700 SDValue N0 = Op.getOperand(0); 14701 // Look out for (store (shl (load), x)). 
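// Promoting such a shift to i32 would prevent the load and the store from
// being folded into the shift instruction, so keep it at i16 here.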
14702 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 14703 return false; 14704 Promote = true; 14705 break; 14706 } 14707 case ISD::ADD: 14708 case ISD::MUL: 14709 case ISD::AND: 14710 case ISD::OR: 14711 case ISD::XOR: 14712 Commute = true; 14713 // fallthrough 14714 case ISD::SUB: { 14715 SDValue N0 = Op.getOperand(0); 14716 SDValue N1 = Op.getOperand(1); 14717 if (!Commute && MayFoldLoad(N1)) 14718 return false; 14719 // Avoid disabling potential load folding opportunities. 14720 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 14721 return false; 14722 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 14723 return false; 14724 Promote = true; 14725 } 14726 } 14727 14728 PVT = MVT::i32; 14729 return Promote; 14730} 14731 14732//===----------------------------------------------------------------------===// 14733// X86 Inline Assembly Support 14734//===----------------------------------------------------------------------===// 14735 14736namespace { 14737 // Helper to match a string separated by whitespace. 14738 bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { 14739 s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. 14740 14741 for (unsigned i = 0, e = args.size(); i != e; ++i) { 14742 StringRef piece(*args[i]); 14743 if (!s.startswith(piece)) // Check if the piece matches. 14744 return false; 14745 14746 s = s.substr(piece.size()); 14747 StringRef::size_type pos = s.find_first_not_of(" \t"); 14748 if (pos == 0) // We matched a prefix. 14749 return false; 14750 14751 s = s.substr(pos); 14752 } 14753 14754 return s.empty(); 14755 } 14756 const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; 14757} 14758 14759bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 14760 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 14761 14762 std::string AsmStr = IA->getAsmString(); 14763 14764 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 14765 if (!Ty || Ty->getBitWidth() % 16 != 0) 14766 return false; 14767 14768 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 14769 SmallVector<StringRef, 4> AsmPieces; 14770 SplitString(AsmStr, AsmPieces, ";\n"); 14771 14772 switch (AsmPieces.size()) { 14773 default: return false; 14774 case 1: 14775 // FIXME: this should verify that we are targeting a 486 or better. If not, 14776 // we will turn this bswap into something that will be lowered to logical 14777 // ops instead of emitting the bswap asm. For now, we don't support 486 or 14778 // lower so don't worry about this. 14779 // bswap $0 14780 if (matchAsm(AsmPieces[0], "bswap", "$0") || 14781 matchAsm(AsmPieces[0], "bswapl", "$0") || 14782 matchAsm(AsmPieces[0], "bswapq", "$0") || 14783 matchAsm(AsmPieces[0], "bswap", "${0:q}") || 14784 matchAsm(AsmPieces[0], "bswapl", "${0:q}") || 14785 matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { 14786 // No need to check constraints, nothing other than the equivalent of 14787 // "=r,0" would be valid here. 
14788 return IntrinsicLowering::LowerToByteSwap(CI); 14789 } 14790 14791 // rorw $$8, ${0:w} --> llvm.bswap.i16 14792 if (CI->getType()->isIntegerTy(16) && 14793 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 14794 (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || 14795 matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { 14796 AsmPieces.clear(); 14797 const std::string &ConstraintsStr = IA->getConstraintString(); 14798 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 14799 std::sort(AsmPieces.begin(), AsmPieces.end()); 14800 if (AsmPieces.size() == 4 && 14801 AsmPieces[0] == "~{cc}" && 14802 AsmPieces[1] == "~{dirflag}" && 14803 AsmPieces[2] == "~{flags}" && 14804 AsmPieces[3] == "~{fpsr}") 14805 return IntrinsicLowering::LowerToByteSwap(CI); 14806 } 14807 break; 14808 case 3: 14809 if (CI->getType()->isIntegerTy(32) && 14810 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && 14811 matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && 14812 matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && 14813 matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { 14814 AsmPieces.clear(); 14815 const std::string &ConstraintsStr = IA->getConstraintString(); 14816 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 14817 std::sort(AsmPieces.begin(), AsmPieces.end()); 14818 if (AsmPieces.size() == 4 && 14819 AsmPieces[0] == "~{cc}" && 14820 AsmPieces[1] == "~{dirflag}" && 14821 AsmPieces[2] == "~{flags}" && 14822 AsmPieces[3] == "~{fpsr}") 14823 return IntrinsicLowering::LowerToByteSwap(CI); 14824 } 14825 14826 if (CI->getType()->isIntegerTy(64)) { 14827 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 14828 if (Constraints.size() >= 2 && 14829 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 14830 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 14831 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 14832 if (matchAsm(AsmPieces[0], "bswap", "%eax") && 14833 matchAsm(AsmPieces[1], "bswap", "%edx") && 14834 matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) 14835 return IntrinsicLowering::LowerToByteSwap(CI); 14836 } 14837 } 14838 break; 14839 } 14840 return false; 14841} 14842 14843 14844 14845/// getConstraintType - Given a constraint letter, return the type of 14846/// constraint it is for this target. 14847X86TargetLowering::ConstraintType 14848X86TargetLowering::getConstraintType(const std::string &Constraint) const { 14849 if (Constraint.size() == 1) { 14850 switch (Constraint[0]) { 14851 case 'R': 14852 case 'q': 14853 case 'Q': 14854 case 'f': 14855 case 't': 14856 case 'u': 14857 case 'y': 14858 case 'x': 14859 case 'Y': 14860 case 'l': 14861 return C_RegisterClass; 14862 case 'a': 14863 case 'b': 14864 case 'c': 14865 case 'd': 14866 case 'S': 14867 case 'D': 14868 case 'A': 14869 return C_Register; 14870 case 'I': 14871 case 'J': 14872 case 'K': 14873 case 'L': 14874 case 'M': 14875 case 'N': 14876 case 'G': 14877 case 'C': 14878 case 'e': 14879 case 'Z': 14880 return C_Other; 14881 default: 14882 break; 14883 } 14884 } 14885 return TargetLowering::getConstraintType(Constraint); 14886} 14887 14888/// Examine constraint type and operand type and determine a weight value. 14889/// This object must already have been set up with the operand type 14890/// and the current alternative constraint selected. 
14891TargetLowering::ConstraintWeight 14892 X86TargetLowering::getSingleConstraintMatchWeight( 14893 AsmOperandInfo &info, const char *constraint) const { 14894 ConstraintWeight weight = CW_Invalid; 14895 Value *CallOperandVal = info.CallOperandVal; 14896 // If we don't have a value, we can't do a match, 14897 // but allow it at the lowest weight. 14898 if (CallOperandVal == NULL) 14899 return CW_Default; 14900 Type *type = CallOperandVal->getType(); 14901 // Look at the constraint type. 14902 switch (*constraint) { 14903 default: 14904 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 14905 case 'R': 14906 case 'q': 14907 case 'Q': 14908 case 'a': 14909 case 'b': 14910 case 'c': 14911 case 'd': 14912 case 'S': 14913 case 'D': 14914 case 'A': 14915 if (CallOperandVal->getType()->isIntegerTy()) 14916 weight = CW_SpecificReg; 14917 break; 14918 case 'f': 14919 case 't': 14920 case 'u': 14921 if (type->isFloatingPointTy()) 14922 weight = CW_SpecificReg; 14923 break; 14924 case 'y': 14925 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 14926 weight = CW_SpecificReg; 14927 break; 14928 case 'x': 14929 case 'Y': 14930 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) 14931 weight = CW_Register; 14932 break; 14933 case 'I': 14934 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 14935 if (C->getZExtValue() <= 31) 14936 weight = CW_Constant; 14937 } 14938 break; 14939 case 'J': 14940 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 14941 if (C->getZExtValue() <= 63) 14942 weight = CW_Constant; 14943 } 14944 break; 14945 case 'K': 14946 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 14947 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 14948 weight = CW_Constant; 14949 } 14950 break; 14951 case 'L': 14952 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 14953 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 14954 weight = CW_Constant; 14955 } 14956 break; 14957 case 'M': 14958 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 14959 if (C->getZExtValue() <= 3) 14960 weight = CW_Constant; 14961 } 14962 break; 14963 case 'N': 14964 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 14965 if (C->getZExtValue() <= 0xff) 14966 weight = CW_Constant; 14967 } 14968 break; 14969 case 'G': 14970 case 'C': 14971 if (dyn_cast<ConstantFP>(CallOperandVal)) { 14972 weight = CW_Constant; 14973 } 14974 break; 14975 case 'e': 14976 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 14977 if ((C->getSExtValue() >= -0x80000000LL) && 14978 (C->getSExtValue() <= 0x7fffffffLL)) 14979 weight = CW_Constant; 14980 } 14981 break; 14982 case 'Z': 14983 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 14984 if (C->getZExtValue() <= 0xffffffff) 14985 weight = CW_Constant; 14986 } 14987 break; 14988 } 14989 return weight; 14990} 14991 14992/// LowerXConstraint - try to replace an X constraint, which matches anything, 14993/// with another that has more specific requirements based on the type of the 14994/// corresponding operand. 14995const char *X86TargetLowering:: 14996LowerXConstraint(EVT ConstraintVT) const { 14997 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 14998 // 'f' like normal targets. 
14999 if (ConstraintVT.isFloatingPoint()) { 15000 if (Subtarget->hasXMMInt()) 15001 return "Y"; 15002 if (Subtarget->hasXMM()) 15003 return "x"; 15004 } 15005 15006 return TargetLowering::LowerXConstraint(ConstraintVT); 15007} 15008 15009/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 15010/// vector. If it is invalid, don't add anything to Ops. 15011void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 15012 std::string &Constraint, 15013 std::vector<SDValue>&Ops, 15014 SelectionDAG &DAG) const { 15015 SDValue Result(0, 0); 15016 15017 // Only support length 1 constraints for now. 15018 if (Constraint.length() > 1) return; 15019 15020 char ConstraintLetter = Constraint[0]; 15021 switch (ConstraintLetter) { 15022 default: break; 15023 case 'I': 15024 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 15025 if (C->getZExtValue() <= 31) { 15026 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 15027 break; 15028 } 15029 } 15030 return; 15031 case 'J': 15032 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 15033 if (C->getZExtValue() <= 63) { 15034 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 15035 break; 15036 } 15037 } 15038 return; 15039 case 'K': 15040 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 15041 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 15042 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 15043 break; 15044 } 15045 } 15046 return; 15047 case 'N': 15048 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 15049 if (C->getZExtValue() <= 255) { 15050 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 15051 break; 15052 } 15053 } 15054 return; 15055 case 'e': { 15056 // 32-bit signed value 15057 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 15058 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 15059 C->getSExtValue())) { 15060 // Widen to 64 bits here to get it sign extended. 15061 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 15062 break; 15063 } 15064 // FIXME gcc accepts some relocatable values here too, but only in certain 15065 // memory models; it's complicated. 15066 } 15067 return; 15068 } 15069 case 'Z': { 15070 // 32-bit unsigned value 15071 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 15072 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 15073 C->getZExtValue())) { 15074 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 15075 break; 15076 } 15077 } 15078 // FIXME gcc accepts some relocatable values here too, but only in certain 15079 // memory models; it's complicated. 15080 return; 15081 } 15082 case 'i': { 15083 // Literal immediates are always ok. 15084 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 15085 // Widen to 64 bits here to get it sign extended. 15086 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 15087 break; 15088 } 15089 15090 // In any sort of PIC mode addresses need to be computed at runtime by 15091 // adding in a register or some sort of table lookup. These can't 15092 // be used as immediates. 15093 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 15094 return; 15095 15096 // If we are in non-pic codegen mode, we allow the address of a global (with 15097 // an optional displacement) to be used with 'i'. 15098 GlobalAddressSDNode *GA = 0; 15099 int64_t Offset = 0; 15100 15101 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
      // TODO: Slight differences here in allocation order and leaving
      // RIP in the class.  Do they matter any more here than they do
      // in the normal allocation?
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
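      // In 64-bit mode "q" can name any general-purpose register, so the
      // full GR8/GR16/GR32/GR64 classes are used below; in 32-bit mode it
      // falls through to 'Q' and is limited to the a/b/c/d registers (for
      // example, an i8 "q" operand may then only live in AL, BL, CL or DL).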
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, X86::GR32RegisterClass);
        else if (VT == MVT::i16)
          return std::make_pair(0U, X86::GR16RegisterClass);
        else if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, X86::GR8RegisterClass);
        else if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, X86::GR64RegisterClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, X86::GR32_ABCDRegisterClass);
      else if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_ABCDRegisterClass);
      else if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass);
      else if (VT == MVT::i64)
        return std::make_pair(0U, X86::GR64_ABCDRegisterClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasXMMInt()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasXMM()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
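  // The generic lookup above only recognizes names that exactly match a
  // register definition, so spellings such as "{st}", "{st(3)}", "{flags}"
  // and the two-register "A" constraint (examples only) are translated by
  // hand below.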
  if (Res.second == 0) {
    // Map st(0) .. st(7) -> ST0 .. ST7.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it
    // can find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
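
// Illustrative sketch (not part of the lowering itself): for source-level
// inline asm such as
//
//   int lo;
//   double v = 42.0;
//   __asm__("cvttsd2si %1, %0" : "=a"(lo) : "x"(v));
//
// the front end is assumed to hand the first operand to the backend as the
// physical-register constraint "{ax}" with an i32 value type; the generic
// lookup finds AX in GR16 and the remapping above moves it to EAX in GR32.
// The "x" operand hits the SSE cases in getRegForInlineAsmConstraint and,
// being f64, is assigned to the FR64 register class.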