X86ISelLowering.cpp revision 54d8ebafc7cee5fc55d37f92e93a760e01cab27b
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that X86 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "x86-isel" 16#include "X86.h" 17#include "X86InstrBuilder.h" 18#include "X86ISelLowering.h" 19#include "X86ShuffleDecode.h" 20#include "X86TargetMachine.h" 21#include "X86TargetObjectFile.h" 22#include "llvm/CallingConv.h" 23#include "llvm/Constants.h" 24#include "llvm/DerivedTypes.h" 25#include "llvm/GlobalAlias.h" 26#include "llvm/GlobalVariable.h" 27#include "llvm/Function.h" 28#include "llvm/Instructions.h" 29#include "llvm/Intrinsics.h" 30#include "llvm/LLVMContext.h" 31#include "llvm/CodeGen/IntrinsicLowering.h" 32#include "llvm/CodeGen/MachineFrameInfo.h" 33#include "llvm/CodeGen/MachineFunction.h" 34#include "llvm/CodeGen/MachineInstrBuilder.h" 35#include "llvm/CodeGen/MachineJumpTableInfo.h" 36#include "llvm/CodeGen/MachineModuleInfo.h" 37#include "llvm/CodeGen/MachineRegisterInfo.h" 38#include "llvm/CodeGen/PseudoSourceValue.h" 39#include "llvm/MC/MCAsmInfo.h" 40#include "llvm/MC/MCContext.h" 41#include "llvm/MC/MCExpr.h" 42#include "llvm/MC/MCSymbol.h" 43#include "llvm/ADT/BitVector.h" 44#include "llvm/ADT/SmallSet.h" 45#include "llvm/ADT/Statistic.h" 46#include "llvm/ADT/StringExtras.h" 47#include "llvm/ADT/VectorExtras.h" 48#include "llvm/Support/CommandLine.h" 49#include "llvm/Support/Debug.h" 50#include "llvm/Support/Dwarf.h" 51#include "llvm/Support/ErrorHandling.h" 52#include "llvm/Support/MathExtras.h" 53#include "llvm/Support/raw_ostream.h" 54using namespace llvm; 55using namespace dwarf; 56 57STATISTIC(NumTailCalls, "Number of tail calls"); 58 59// Forward declarations. 60static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 61 SDValue V2); 62 63static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { 64 65 bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit(); 66 67 if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) { 68 if (is64Bit) 69 return new X8664_MachoTargetObjectFile(); 70 return new TargetLoweringObjectFileMachO(); 71 } 72 73 if (TM.getSubtarget<X86Subtarget>().isTargetELF() ){ 74 if (is64Bit) 75 return new X8664_ELFTargetObjectFile(TM); 76 return new X8632_ELFTargetObjectFile(TM); 77 } 78 if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) 79 return new TargetLoweringObjectFileCOFF(); 80 llvm_unreachable("unknown subtarget type"); 81} 82 83X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 84 : TargetLowering(TM, createTLOF(TM)) { 85 Subtarget = &TM.getSubtarget<X86Subtarget>(); 86 X86ScalarSSEf64 = Subtarget->hasXMMInt(); 87 X86ScalarSSEf32 = Subtarget->hasXMM(); 88 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; 89 90 RegInfo = TM.getRegisterInfo(); 91 TD = getTargetData(); 92 93 // Set up the TargetLowering object. 94 static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; 95 96 // X86 is weird, it always uses i8 for shift amounts and setcc results. 
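  // Editor's note (illustrative addition, not in the original source): the
  // i8 convention mirrors what the hardware produces and consumes.  SETcc
  // writes a byte register and variable shifts take their count in %cl, so
  // for example
  //
  //   %c = icmp eq i32 %a, %b
  //   %s = shl i32 %x, %amt
  //
  // is typically selected as something like
  //
  //   cmpl  %esi, %edi
  //   sete  %al              # the boolean lives in an 8-bit register
  //   movb  %dl, %cl
  //   shll  %cl, %ebx        # the shift amount is an 8-bit operand
  //
  // The register assignment here is hypothetical; only the i8 width matters.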
97 setShiftAmountType(MVT::i8); 98 setBooleanContents(ZeroOrOneBooleanContent); 99 setSchedulingPreference(Sched::RegPressure); 100 setStackPointerRegisterToSaveRestore(X86StackPtr); 101 102 if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { 103 // Setup Windows compiler runtime calls. 104 setLibcallName(RTLIB::SDIV_I64, "_alldiv"); 105 setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); 106 setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2"); 107 setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2"); 108 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); 109 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); 110 setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C); 111 setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C); 112 } 113 114 if (Subtarget->isTargetDarwin()) { 115 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 116 setUseUnderscoreSetJmp(false); 117 setUseUnderscoreLongJmp(false); 118 } else if (Subtarget->isTargetMingw()) { 119 // MS runtime is weird: it exports _setjmp, but longjmp! 120 setUseUnderscoreSetJmp(true); 121 setUseUnderscoreLongJmp(false); 122 } else { 123 setUseUnderscoreSetJmp(true); 124 setUseUnderscoreLongJmp(true); 125 } 126 127 // Set up the register classes. 128 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 129 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 130 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 131 if (Subtarget->is64Bit()) 132 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 133 134 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 135 136 // We don't accept any truncstore of integer registers. 137 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 138 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 139 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 140 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 141 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 142 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 143 144 // SETOEQ and SETUNE require checking two conditions. 145 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 146 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 147 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 148 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 149 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 150 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 151 152 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 153 // operation. 154 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 155 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 156 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 157 158 if (Subtarget->is64Bit()) { 159 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 160 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 161 } else if (!UseSoftFloat) { 162 // We have an algorithm for SSE2->double, and we turn this into a 163 // 64-bit FILD followed by conditional FADD for other targets. 164 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 165 // We have an algorithm for SSE2, and we turn this into a 64-bit 166 // FILD for other targets. 167 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 168 } 169 170 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 171 // this operation. 
172 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 173 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 174 175 if (!UseSoftFloat) { 176 // SSE has no i16 to fp conversion, only i32 177 if (X86ScalarSSEf32) { 178 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 179 // f32 and f64 cases are Legal, f80 case is not 180 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 181 } else { 182 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 183 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 184 } 185 } else { 186 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 187 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 188 } 189 190 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 191 // are Legal, f80 is custom lowered. 192 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 193 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 194 195 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 196 // this operation. 197 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 198 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 199 200 if (X86ScalarSSEf32) { 201 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 202 // f32 and f64 cases are Legal, f80 case is not 203 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 204 } else { 205 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 206 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 207 } 208 209 // Handle FP_TO_UINT by promoting the destination to a larger signed 210 // conversion. 211 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 212 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 213 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 214 215 if (Subtarget->is64Bit()) { 216 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 217 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 218 } else if (!UseSoftFloat) { 219 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 220 // Expand FP_TO_UINT into a select. 221 // FIXME: We would like to use a Custom expander here eventually to do 222 // the optimal thing for SSE vs. the default expansion in the legalizer. 223 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 224 else 225 // With SSE3 we can use fisttpll to convert to a signed i64; without 226 // SSE, we're stuck with a fistpll. 227 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 228 } 229 230 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 231 if (!X86ScalarSSEf64) { 232 setOperationAction(ISD::BITCAST , MVT::f32 , Expand); 233 setOperationAction(ISD::BITCAST , MVT::i32 , Expand); 234 if (Subtarget->is64Bit()) { 235 setOperationAction(ISD::BITCAST , MVT::f64 , Expand); 236 // Without SSE, i64->f64 goes through memory. 237 setOperationAction(ISD::BITCAST , MVT::i64 , Expand); 238 } 239 } 240 241 // Scalar integer divide and remainder are lowered to use operations that 242 // produce two results, to match the available instructions. This exposes 243 // the two-result form to trivial CSE, which is able to combine x/y and x%y 244 // into a single instruction. 245 // 246 // Scalar integer multiply-high is also lowered to use two-result 247 // operations, to match the available instructions. However, plain multiply 248 // (low) operations are left as Legal, as there are single-result 249 // instructions for this in x86. 
Using the two-result multiply instructions 250 // when both high and low results are needed must be arranged by dagcombine. 251 for (unsigned i = 0, e = 4; i != e; ++i) { 252 MVT VT = IntVTs[i]; 253 setOperationAction(ISD::MULHS, VT, Expand); 254 setOperationAction(ISD::MULHU, VT, Expand); 255 setOperationAction(ISD::SDIV, VT, Expand); 256 setOperationAction(ISD::UDIV, VT, Expand); 257 setOperationAction(ISD::SREM, VT, Expand); 258 setOperationAction(ISD::UREM, VT, Expand); 259 260 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 261 setOperationAction(ISD::ADDC, VT, Custom); 262 setOperationAction(ISD::ADDE, VT, Custom); 263 setOperationAction(ISD::SUBC, VT, Custom); 264 setOperationAction(ISD::SUBE, VT, Custom); 265 } 266 267 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 268 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 269 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 270 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 271 if (Subtarget->is64Bit()) 272 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 273 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 274 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 275 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 276 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 277 setOperationAction(ISD::FREM , MVT::f32 , Expand); 278 setOperationAction(ISD::FREM , MVT::f64 , Expand); 279 setOperationAction(ISD::FREM , MVT::f80 , Expand); 280 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 281 282 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 283 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 284 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 285 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 286 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 287 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 288 if (Subtarget->is64Bit()) { 289 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 290 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 291 } 292 293 if (Subtarget->hasPOPCNT()) { 294 setOperationAction(ISD::CTPOP , MVT::i8 , Promote); 295 } else { 296 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 297 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 298 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 299 if (Subtarget->is64Bit()) 300 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 301 } 302 303 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 304 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 305 306 // These should be promoted to a larger select which is supported. 307 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 308 // X86 wants to expand cmov itself. 
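  // Editor's note (illustrative addition, not in the original source):
  // "expand cmov itself" means the generic ISD::SELECT is custom lowered to
  // the target's own CMOV node, which carries an explicit condition code and
  // an EFLAGS input, instead of being expanded to a branch.  As a sketch,
  //
  //   %r = select i1 %c, i32 %a, i32 %b
  //
  // typically ends up as
  //
  //   testb   $1, %cl
  //   cmovnel %esi, %eax
  //
  // The exact instruction choice is decided by the lowering code later in
  // this file; this only shows the intended shape.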
  setOperationAction(ISD::SELECT , MVT::i8 , Custom);
  setOperationAction(ISD::SELECT , MVT::i16 , Custom);
  setOperationAction(ISD::SELECT , MVT::i32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f64 , Custom);
  setOperationAction(ISD::SELECT , MVT::f80 , Custom);
  setOperationAction(ISD::SETCC , MVT::i8 , Custom);
  setOperationAction(ISD::SETCC , MVT::i16 , Custom);
  setOperationAction(ISD::SETCC , MVT::i32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f64 , Custom);
  setOperationAction(ISD::SETCC , MVT::f80 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT , MVT::i64 , Custom);
    setOperationAction(ISD::SETCC , MVT::i64 , Custom);
  }
  setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
  setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
  setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
    setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
    setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
  }

  if (Subtarget->hasXMM())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER so we should lower this.
  setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
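  // Editor's note (illustrative addition, not in the original source): the
  // folding referred to above means that a sequence of
  //
  //   <memory fence>
  //   atomic add of 1 to *p
  //   <memory fence>
  //
  // can be emitted as a single
  //
  //   lock addl $1, (%rdi)
  //
  // because the LOCK prefix already provides the required ordering.  This
  // sketches the rationale, not the exact pattern the fold matches.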
364 setShouldFoldAtomicFences(true); 365 366 // Expand certain atomics 367 for (unsigned i = 0, e = 4; i != e; ++i) { 368 MVT VT = IntVTs[i]; 369 setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); 370 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); 371 } 372 373 if (!Subtarget->is64Bit()) { 374 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 375 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 376 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 377 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 378 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 379 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 380 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 381 } 382 383 // FIXME - use subtarget debug flags 384 if (!Subtarget->isTargetDarwin() && 385 !Subtarget->isTargetELF() && 386 !Subtarget->isTargetCygMing()) { 387 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 388 } 389 390 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 391 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 392 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 393 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 394 if (Subtarget->is64Bit()) { 395 setExceptionPointerRegister(X86::RAX); 396 setExceptionSelectorRegister(X86::RDX); 397 } else { 398 setExceptionPointerRegister(X86::EAX); 399 setExceptionSelectorRegister(X86::EDX); 400 } 401 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 402 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 403 404 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 405 406 setOperationAction(ISD::TRAP, MVT::Other, Legal); 407 408 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 409 setOperationAction(ISD::VASTART , MVT::Other, Custom); 410 setOperationAction(ISD::VAEND , MVT::Other, Expand); 411 if (Subtarget->is64Bit()) { 412 setOperationAction(ISD::VAARG , MVT::Other, Custom); 413 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 414 } else { 415 setOperationAction(ISD::VAARG , MVT::Other, Expand); 416 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 417 } 418 419 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 420 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 421 if (Subtarget->is64Bit()) 422 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 423 if (Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) 424 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 425 else 426 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 427 428 if (!UseSoftFloat && X86ScalarSSEf64) { 429 // f32 and f64 use SSE. 430 // Set up the FP register classes. 431 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 432 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 433 434 // Use ANDPD to simulate FABS. 435 setOperationAction(ISD::FABS , MVT::f64, Custom); 436 setOperationAction(ISD::FABS , MVT::f32, Custom); 437 438 // Use XORP to simulate FNEG. 439 setOperationAction(ISD::FNEG , MVT::f64, Custom); 440 setOperationAction(ISD::FNEG , MVT::f32, Custom); 441 442 // Use ANDPD and ORPD to simulate FCOPYSIGN. 
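    // Editor's note (illustrative addition, not in the original source):
    // these "simulate" lowerings are the usual sign-bit tricks.  FABS is an
    // ANDPD with a constant that clears only the sign bit, FNEG is an XORPD
    // with a constant that flips it, and FCOPYSIGN combines a mask of the
    // magnitude with a mask of the sign source, roughly
    //
    //   fabs(x):  andpd  .LCPI_abs(%rip), %xmm0   # 0x7FFFFFFFFFFFFFFF
    //   fneg(x):  xorpd  .LCPI_neg(%rip), %xmm0   # 0x8000000000000000
    //
    // The constant-pool labels are made up for illustration.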
443 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 444 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 445 446 // We don't support sin/cos/fmod 447 setOperationAction(ISD::FSIN , MVT::f64, Expand); 448 setOperationAction(ISD::FCOS , MVT::f64, Expand); 449 setOperationAction(ISD::FSIN , MVT::f32, Expand); 450 setOperationAction(ISD::FCOS , MVT::f32, Expand); 451 452 // Expand FP immediates into loads from the stack, except for the special 453 // cases we handle. 454 addLegalFPImmediate(APFloat(+0.0)); // xorpd 455 addLegalFPImmediate(APFloat(+0.0f)); // xorps 456 } else if (!UseSoftFloat && X86ScalarSSEf32) { 457 // Use SSE for f32, x87 for f64. 458 // Set up the FP register classes. 459 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 460 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 461 462 // Use ANDPS to simulate FABS. 463 setOperationAction(ISD::FABS , MVT::f32, Custom); 464 465 // Use XORP to simulate FNEG. 466 setOperationAction(ISD::FNEG , MVT::f32, Custom); 467 468 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 469 470 // Use ANDPS and ORPS to simulate FCOPYSIGN. 471 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 472 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 473 474 // We don't support sin/cos/fmod 475 setOperationAction(ISD::FSIN , MVT::f32, Expand); 476 setOperationAction(ISD::FCOS , MVT::f32, Expand); 477 478 // Special cases we handle for FP constants. 479 addLegalFPImmediate(APFloat(+0.0f)); // xorps 480 addLegalFPImmediate(APFloat(+0.0)); // FLD0 481 addLegalFPImmediate(APFloat(+1.0)); // FLD1 482 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 483 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 484 485 if (!UnsafeFPMath) { 486 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 487 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 488 } 489 } else if (!UseSoftFloat) { 490 // f32 and f64 in x87. 491 // Set up the FP register classes. 492 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 493 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 494 495 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 496 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 497 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 498 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 499 500 if (!UnsafeFPMath) { 501 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 502 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 503 } 504 addLegalFPImmediate(APFloat(+0.0)); // FLD0 505 addLegalFPImmediate(APFloat(+1.0)); // FLD1 506 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 507 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 508 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 509 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 510 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 511 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 512 } 513 514 // Long double always uses X87. 
515 if (!UseSoftFloat) { 516 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 517 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 518 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 519 { 520 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); 521 addLegalFPImmediate(TmpFlt); // FLD0 522 TmpFlt.changeSign(); 523 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 524 525 bool ignored; 526 APFloat TmpFlt2(+1.0); 527 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 528 &ignored); 529 addLegalFPImmediate(TmpFlt2); // FLD1 530 TmpFlt2.changeSign(); 531 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 532 } 533 534 if (!UnsafeFPMath) { 535 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 536 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 537 } 538 } 539 540 // Always use a library call for pow. 541 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 542 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 543 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 544 545 setOperationAction(ISD::FLOG, MVT::f80, Expand); 546 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 547 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 548 setOperationAction(ISD::FEXP, MVT::f80, Expand); 549 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 550 551 // First set operation action for all vector types to either promote 552 // (for widening) or expand (for scalarization). Then we will selectively 553 // turn on ones that can be effectively codegen'd. 554 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 555 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 556 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 557 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 558 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 559 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 560 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 561 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 562 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 563 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 564 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 565 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 566 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 567 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 568 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 569 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 570 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 571 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 572 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 573 setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 574 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 575 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 576 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 577 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 578 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 579 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 580 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 581 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 582 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 583 
setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 584 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 585 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 586 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 587 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 588 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 589 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 590 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 591 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 592 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 593 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 594 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 595 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 596 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 597 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 598 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 599 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 600 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 601 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 602 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 603 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 604 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 605 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 606 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 607 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 608 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 609 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 610 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 611 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 612 setTruncStoreAction((MVT::SimpleValueType)VT, 613 (MVT::SimpleValueType)InnerVT, Expand); 614 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 615 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 616 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 617 } 618 619 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 620 // with -msoft-float, disable use of MMX as well. 621 if (!UseSoftFloat && Subtarget->hasMMX()) { 622 addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); 623 // No operations on x86mmx supported, everything uses intrinsics. 624 } 625 626 // MMX-sized vectors (other than x86mmx) are expected to be expanded 627 // into smaller operations. 
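  // Editor's note (illustrative addition, not in the original source):
  // marking these 64-bit vector types Expand keeps them off MMX and lets the
  // generic legalizer break them up.  For example, an IR-level
  //
  //   %r = add <2 x i32> %a, %b
  //
  // is typically widened to a 128-bit SSE2 operation or scalarized into two
  // i32 adds, rather than selected as an MMX paddd.  The precise outcome is
  // up to the type legalizer, so treat this as a sketch of the intent.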
628 setOperationAction(ISD::MULHS, MVT::v8i8, Expand); 629 setOperationAction(ISD::MULHS, MVT::v4i16, Expand); 630 setOperationAction(ISD::MULHS, MVT::v2i32, Expand); 631 setOperationAction(ISD::MULHS, MVT::v1i64, Expand); 632 setOperationAction(ISD::AND, MVT::v8i8, Expand); 633 setOperationAction(ISD::AND, MVT::v4i16, Expand); 634 setOperationAction(ISD::AND, MVT::v2i32, Expand); 635 setOperationAction(ISD::AND, MVT::v1i64, Expand); 636 setOperationAction(ISD::OR, MVT::v8i8, Expand); 637 setOperationAction(ISD::OR, MVT::v4i16, Expand); 638 setOperationAction(ISD::OR, MVT::v2i32, Expand); 639 setOperationAction(ISD::OR, MVT::v1i64, Expand); 640 setOperationAction(ISD::XOR, MVT::v8i8, Expand); 641 setOperationAction(ISD::XOR, MVT::v4i16, Expand); 642 setOperationAction(ISD::XOR, MVT::v2i32, Expand); 643 setOperationAction(ISD::XOR, MVT::v1i64, Expand); 644 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); 645 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); 646 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); 647 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); 648 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); 649 setOperationAction(ISD::SELECT, MVT::v8i8, Expand); 650 setOperationAction(ISD::SELECT, MVT::v4i16, Expand); 651 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 652 setOperationAction(ISD::SELECT, MVT::v1i64, Expand); 653 setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); 654 setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); 655 setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); 656 setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); 657 658 if (!UseSoftFloat && Subtarget->hasXMM()) { 659 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 660 661 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 662 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 663 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 664 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 665 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 666 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 667 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 668 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 669 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 670 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 671 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 672 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 673 } 674 675 if (!UseSoftFloat && Subtarget->hasXMMInt()) { 676 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 677 678 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 679 // registers cannot be used even for integer operations. 
680 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 681 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 682 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 683 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 684 685 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 686 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 687 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 688 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 689 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 690 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 691 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 692 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 693 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 694 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 695 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 696 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 697 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 698 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 699 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 700 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 701 702 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 703 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 704 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 705 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 706 707 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 708 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 709 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 710 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 711 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 712 713 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 714 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 715 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 716 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 717 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 718 719 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 720 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 721 EVT VT = (MVT::SimpleValueType)i; 722 // Do not attempt to custom lower non-power-of-2 vectors 723 if (!isPowerOf2_32(VT.getVectorNumElements())) 724 continue; 725 // Do not attempt to custom lower non-128-bit vectors 726 if (!VT.is128BitVector()) 727 continue; 728 setOperationAction(ISD::BUILD_VECTOR, 729 VT.getSimpleVT().SimpleTy, Custom); 730 setOperationAction(ISD::VECTOR_SHUFFLE, 731 VT.getSimpleVT().SimpleTy, Custom); 732 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 733 VT.getSimpleVT().SimpleTy, Custom); 734 } 735 736 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 737 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 738 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 739 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 740 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 741 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 742 743 if (Subtarget->is64Bit()) { 744 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 745 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 746 } 747 748 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
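    // Editor's note (illustrative addition, not in the original source): the
    // promotion below does not change the bit width, it just funnels every
    // 128-bit integer vector type through one legal type so a single pattern
    // covers them all.  For instance
    //
    //   %r = and <4 x i32> %a, %b
    //
    // is bitcast to v2i64, selected as a single PAND, and bitcast back; the
    // same applies to OR, XOR, LOAD and SELECT.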
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width.  f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
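    // Editor's note (illustrative addition, not in the original source): as
    // an example of the width mismatch, the register form of PINSRW reads a
    // 32-bit GPR while its memory form reads only 16 bits, and INSERTPS
    // packs the source/destination lane selection into its immediate, so the
    // custom lowering picks the operand form explicitly instead of relying
    // on one generic pattern.  This reading comes from the instruction-set
    // reference, not from this file.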
804 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 805 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 806 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 807 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 808 809 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 810 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 811 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 812 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 813 814 if (Subtarget->is64Bit()) { 815 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 816 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 817 } 818 } 819 820 if (Subtarget->hasSSE42()) 821 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 822 823 if (!UseSoftFloat && Subtarget->hasAVX()) { 824 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 825 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 826 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 827 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 828 addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); 829 830 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 831 setOperationAction(ISD::LOAD, MVT::v8i32, Legal); 832 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 833 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 834 835 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 836 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 837 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 838 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 839 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 840 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 841 842 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 843 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 844 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 845 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 846 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 847 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 848 849 // Custom lower build_vector, vector_shuffle, scalar_to_vector, 850 // insert_vector_elt extract_subvector and extract_vector_elt for 851 // 256-bit types. 852 for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 853 i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; 854 ++i) { 855 MVT::SimpleValueType VT = (MVT::SimpleValueType)i; 856 // Do not attempt to custom lower non-256-bit vectors 857 if (!isPowerOf2_32(MVT(VT).getVectorNumElements()) 858 || (MVT(VT).getSizeInBits() < 256)) 859 continue; 860 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 861 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 862 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 863 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 864 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 865 } 866 // Custom-lower insert_subvector and extract_subvector based on 867 // the result type. 868 for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 869 i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; 870 ++i) { 871 MVT::SimpleValueType VT = (MVT::SimpleValueType)i; 872 // Do not attempt to custom lower non-256-bit vectors 873 if (!isPowerOf2_32(MVT(VT).getVectorNumElements())) 874 continue; 875 876 if (MVT(VT).getSizeInBits() == 128) { 877 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 878 } 879 else if (MVT(VT).getSizeInBits() == 256) { 880 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 881 } 882 } 883 884 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 
885 // Don't promote loads because we need them for VPERM vector index versions. 886 887 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 888 VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; 889 VT++) { 890 if (!isPowerOf2_32(MVT((MVT::SimpleValueType)VT).getVectorNumElements()) 891 || (MVT((MVT::SimpleValueType)VT).getSizeInBits() < 256)) 892 continue; 893 setOperationAction(ISD::AND, (MVT::SimpleValueType)VT, Promote); 894 AddPromotedToType (ISD::AND, (MVT::SimpleValueType)VT, MVT::v4i64); 895 setOperationAction(ISD::OR, (MVT::SimpleValueType)VT, Promote); 896 AddPromotedToType (ISD::OR, (MVT::SimpleValueType)VT, MVT::v4i64); 897 setOperationAction(ISD::XOR, (MVT::SimpleValueType)VT, Promote); 898 AddPromotedToType (ISD::XOR, (MVT::SimpleValueType)VT, MVT::v4i64); 899 //setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Promote); 900 //AddPromotedToType (ISD::LOAD, (MVT::SimpleValueType)VT, MVT::v4i64); 901 setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote); 902 AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v4i64); 903 } 904 } 905 906 // We want to custom lower some of our intrinsics. 907 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 908 909 910 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 911 // handle type legalization for these operations here. 912 // 913 // FIXME: We really should do custom legalization for addition and 914 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 915 // than generic legalization for 64-bit multiplication-with-overflow, though. 916 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { 917 // Add/Sub/Mul with overflow operations are custom lowered. 918 MVT VT = IntVTs[i]; 919 setOperationAction(ISD::SADDO, VT, Custom); 920 setOperationAction(ISD::UADDO, VT, Custom); 921 setOperationAction(ISD::SSUBO, VT, Custom); 922 setOperationAction(ISD::USUBO, VT, Custom); 923 setOperationAction(ISD::SMULO, VT, Custom); 924 setOperationAction(ISD::UMULO, VT, Custom); 925 } 926 927 // There are no 8-bit 3-address imul/mul instructions 928 setOperationAction(ISD::SMULO, MVT::i8, Expand); 929 setOperationAction(ISD::UMULO, MVT::i8, Expand); 930 931 if (!Subtarget->is64Bit()) { 932 // These libcalls are not available in 32-bit. 933 setLibcallName(RTLIB::SHL_I128, 0); 934 setLibcallName(RTLIB::SRL_I128, 0); 935 setLibcallName(RTLIB::SRA_I128, 0); 936 } 937 938 // We have target-specific dag combine patterns for the following nodes: 939 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 940 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 941 setTargetDAGCombine(ISD::BUILD_VECTOR); 942 setTargetDAGCombine(ISD::SELECT); 943 setTargetDAGCombine(ISD::SHL); 944 setTargetDAGCombine(ISD::SRA); 945 setTargetDAGCombine(ISD::SRL); 946 setTargetDAGCombine(ISD::OR); 947 setTargetDAGCombine(ISD::AND); 948 setTargetDAGCombine(ISD::ADD); 949 setTargetDAGCombine(ISD::SUB); 950 setTargetDAGCombine(ISD::STORE); 951 setTargetDAGCombine(ISD::ZERO_EXTEND); 952 if (Subtarget->is64Bit()) 953 setTargetDAGCombine(ISD::MUL); 954 955 computeRegisterProperties(); 956 957 // On Darwin, -Os means optimize for size without hurting performance, 958 // do not reduce the limit. 959 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 960 maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; 961 maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 962 maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 
    8 : 4;
  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasXMM())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target-specific optimal type for the
/// load and store operations that result from memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination's alignment can satisfy
/// any constraint. Similarly, if SrcAlign is zero, there is no need to check
/// it against an alignment requirement, probably because the source does not
/// need to be loaded. If 'NonScalarIntSafe' is true, it is safe to return a
/// non-scalar-integer type, e.g. for an empty-string source, a constant, or
/// a value loaded from memory. 'MemcpyStrSrc' indicates whether the memcpy
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
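  // Editor's note (illustrative addition, not in the original source): as a
  // concrete reading of the logic below, a 32-byte memcpy with 16-byte
  // aligned source and destination on an SSE2 subtarget gets MVT::v4i32
  // (two 16-byte vector moves); an 8-byte copy on a 32-bit SSE2 target whose
  // source is a string constant gets MVT::i32 rather than f64; and a 64-bit
  // target otherwise falls back to MVT::i64.  These are intended outcomes of
  // the conditions below, not additional rules.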
1041 const Function *F = MF.getFunction(); 1042 if (NonScalarIntSafe && 1043 !F->hasFnAttr(Attribute::NoImplicitFloat)) { 1044 if (Size >= 16 && 1045 (Subtarget->isUnalignedMemAccessFast() || 1046 ((DstAlign == 0 || DstAlign >= 16) && 1047 (SrcAlign == 0 || SrcAlign >= 16))) && 1048 Subtarget->getStackAlignment() >= 16) { 1049 if (Subtarget->hasSSE2()) 1050 return MVT::v4i32; 1051 if (Subtarget->hasSSE1()) 1052 return MVT::v4f32; 1053 } else if (!MemcpyStrSrc && Size >= 8 && 1054 !Subtarget->is64Bit() && 1055 Subtarget->getStackAlignment() >= 8 && 1056 Subtarget->hasXMMInt()) { 1057 // Do not use f64 to lower memcpy if source is string constant. It's 1058 // better to use i32 to avoid the loads. 1059 return MVT::f64; 1060 } 1061 } 1062 if (Subtarget->is64Bit() && Size >= 8) 1063 return MVT::i64; 1064 return MVT::i32; 1065} 1066 1067/// getJumpTableEncoding - Return the entry encoding for a jump table in the 1068/// current function. The returned value is a member of the 1069/// MachineJumpTableInfo::JTEntryKind enum. 1070unsigned X86TargetLowering::getJumpTableEncoding() const { 1071 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 1072 // symbol. 1073 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1074 Subtarget->isPICStyleGOT()) 1075 return MachineJumpTableInfo::EK_Custom32; 1076 1077 // Otherwise, use the normal jump table encoding heuristics. 1078 return TargetLowering::getJumpTableEncoding(); 1079} 1080 1081const MCExpr * 1082X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1083 const MachineBasicBlock *MBB, 1084 unsigned uid,MCContext &Ctx) const{ 1085 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1086 Subtarget->isPICStyleGOT()); 1087 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1088 // entries. 1089 return MCSymbolRefExpr::Create(MBB->getSymbol(), 1090 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1091} 1092 1093/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 1094/// jumptable. 1095SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1096 SelectionDAG &DAG) const { 1097 if (!Subtarget->is64Bit()) 1098 // This doesn't have DebugLoc associated with it, but is not really the 1099 // same as a Register. 1100 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1101 return Table; 1102} 1103 1104/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1105/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1106/// MCExpr. 1107const MCExpr *X86TargetLowering:: 1108getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1109 MCContext &Ctx) const { 1110 // X86-64 uses RIP relative addressing based on the jump table label. 1111 if (Subtarget->isPICStyleRIPRel()) 1112 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1113 1114 // Otherwise, the reference is relative to the PIC base. 1115 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); 1116} 1117 1118/// getFunctionAlignment - Return the Log2 alignment of this function. 1119unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { 1120 return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; 1121} 1122 1123// FIXME: Why this routine is here? Move to RegInfo! 
1124std::pair<const TargetRegisterClass*, uint8_t> 1125X86TargetLowering::findRepresentativeClass(EVT VT) const{ 1126 const TargetRegisterClass *RRC = 0; 1127 uint8_t Cost = 1; 1128 switch (VT.getSimpleVT().SimpleTy) { 1129 default: 1130 return TargetLowering::findRepresentativeClass(VT); 1131 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1132 RRC = (Subtarget->is64Bit() 1133 ? X86::GR64RegisterClass : X86::GR32RegisterClass); 1134 break; 1135 case MVT::x86mmx: 1136 RRC = X86::VR64RegisterClass; 1137 break; 1138 case MVT::f32: case MVT::f64: 1139 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1140 case MVT::v4f32: case MVT::v2f64: 1141 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1142 case MVT::v4f64: 1143 RRC = X86::VR128RegisterClass; 1144 break; 1145 } 1146 return std::make_pair(RRC, Cost); 1147} 1148 1149// FIXME: Why this routine is here? Move to RegInfo! 1150unsigned 1151X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, 1152 MachineFunction &MF) const { 1153 const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); 1154 1155 unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; 1156 switch (RC->getID()) { 1157 default: 1158 return 0; 1159 case X86::GR32RegClassID: 1160 return 4 - FPDiff; 1161 case X86::GR64RegClassID: 1162 return 8 - FPDiff; 1163 case X86::VR128RegClassID: 1164 return Subtarget->is64Bit() ? 10 : 4; 1165 case X86::VR64RegClassID: 1166 return 4; 1167 } 1168} 1169 1170bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1171 unsigned &Offset) const { 1172 if (!Subtarget->isTargetLinux()) 1173 return false; 1174 1175 if (Subtarget->is64Bit()) { 1176 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1177 Offset = 0x28; 1178 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1179 AddressSpace = 256; 1180 else 1181 AddressSpace = 257; 1182 } else { 1183 // %gs:0x14 on i386 1184 Offset = 0x14; 1185 AddressSpace = 256; 1186 } 1187 return true; 1188} 1189 1190 1191//===----------------------------------------------------------------------===// 1192// Return Value Calling Convention Implementation 1193//===----------------------------------------------------------------------===// 1194 1195#include "X86GenCallingConv.inc" 1196 1197bool 1198X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, 1199 const SmallVectorImpl<ISD::OutputArg> &Outs, 1200 LLVMContext &Context) const { 1201 SmallVector<CCValAssign, 16> RVLocs; 1202 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1203 RVLocs, Context); 1204 return CCInfo.CheckReturn(Outs, RetCC_X86); 1205} 1206 1207SDValue 1208X86TargetLowering::LowerReturn(SDValue Chain, 1209 CallingConv::ID CallConv, bool isVarArg, 1210 const SmallVectorImpl<ISD::OutputArg> &Outs, 1211 const SmallVectorImpl<SDValue> &OutVals, 1212 DebugLoc dl, SelectionDAG &DAG) const { 1213 MachineFunction &MF = DAG.getMachineFunction(); 1214 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1215 1216 SmallVector<CCValAssign, 16> RVLocs; 1217 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1218 RVLocs, *DAG.getContext()); 1219 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1220 1221 // Add the regs to the liveout set for the function. 
1222 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1223 for (unsigned i = 0; i != RVLocs.size(); ++i) 1224 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1225 MRI.addLiveOut(RVLocs[i].getLocReg()); 1226 1227 SDValue Flag; 1228 1229 SmallVector<SDValue, 6> RetOps; 1230 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1231 // Operand #1 = Bytes To Pop 1232 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1233 MVT::i16)); 1234 1235 // Copy the result values into the output registers. 1236 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1237 CCValAssign &VA = RVLocs[i]; 1238 assert(VA.isRegLoc() && "Can only return in registers!"); 1239 SDValue ValToCopy = OutVals[i]; 1240 EVT ValVT = ValToCopy.getValueType(); 1241 1242 // If this is x86-64, and we disabled SSE, we can't return FP values, 1243 // or SSE or MMX vectors. 1244 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1245 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1246 (Subtarget->is64Bit() && !Subtarget->hasXMM())) { 1247 report_fatal_error("SSE register return with SSE disabled"); 1248 } 1249 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1250 // llvm-gcc has never done it right and no one has noticed, so this 1251 // should be OK for now. 1252 if (ValVT == MVT::f64 && 1253 (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) 1254 report_fatal_error("SSE2 register return with SSE2 disabled"); 1255 1256 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1257 // the RET instruction and handled by the FP Stackifier. 1258 if (VA.getLocReg() == X86::ST0 || 1259 VA.getLocReg() == X86::ST1) { 1260 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1261 // change the value to the FP stack register class. 1262 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1263 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1264 RetOps.push_back(ValToCopy); 1265 // Don't emit a copytoreg. 1266 continue; 1267 } 1268 1269 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1270 // which is returned in RAX / RDX. 1271 if (Subtarget->is64Bit()) { 1272 if (ValVT == MVT::x86mmx) { 1273 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1274 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1275 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1276 ValToCopy); 1277 // If we don't have SSE2 available, convert to v4f32 so the generated 1278 // register is legal. 1279 if (!Subtarget->hasSSE2()) 1280 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1281 } 1282 } 1283 } 1284 1285 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1286 Flag = Chain.getValue(1); 1287 } 1288 1289 // The x86-64 ABI for returning structs by value requires that we copy 1290 // the sret argument into %rax for the return. We saved the argument into 1291 // a virtual register in the entry block, so now we copy the value out 1292 // and into %rax. 
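  // Editor's note (illustrative addition, not in the original source): this
  // is the x86-64 rule that a function returning a large aggregate through a
  // hidden sret pointer must also return that pointer in %rax.  For example,
  // for
  //
  //   struct S { int v[8]; };
  //   struct S make(void);
  //
  // the caller passes the address of the result slot (in %rdi under the
  // SysV ABI) and may rely on getting the same address back in %rax.  The
  // virtual register mentioned above is set up in LowerFormalArguments.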
1293 if (Subtarget->is64Bit() && 1294 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1295 MachineFunction &MF = DAG.getMachineFunction(); 1296 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1297 unsigned Reg = FuncInfo->getSRetReturnReg(); 1298 assert(Reg && 1299 "SRetReturnReg should have been set in LowerFormalArguments()."); 1300 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1301 1302 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1303 Flag = Chain.getValue(1); 1304 1305 // RAX now acts like a return value. 1306 MRI.addLiveOut(X86::RAX); 1307 } 1308 1309 RetOps[0] = Chain; // Update chain. 1310 1311 // Add the flag if we have it. 1312 if (Flag.getNode()) 1313 RetOps.push_back(Flag); 1314 1315 return DAG.getNode(X86ISD::RET_FLAG, dl, 1316 MVT::Other, &RetOps[0], RetOps.size()); 1317} 1318 1319bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { 1320 if (N->getNumValues() != 1) 1321 return false; 1322 if (!N->hasNUsesOfValue(1, 0)) 1323 return false; 1324 1325 SDNode *Copy = *N->use_begin(); 1326 if (Copy->getOpcode() != ISD::CopyToReg && 1327 Copy->getOpcode() != ISD::FP_EXTEND) 1328 return false; 1329 1330 bool HasRet = false; 1331 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1332 UI != UE; ++UI) { 1333 if (UI->getOpcode() != X86ISD::RET_FLAG) 1334 return false; 1335 HasRet = true; 1336 } 1337 1338 return HasRet; 1339} 1340 1341/// LowerCallResult - Lower the result values of a call into the 1342/// appropriate copies out of appropriate physical registers. 1343/// 1344SDValue 1345X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1346 CallingConv::ID CallConv, bool isVarArg, 1347 const SmallVectorImpl<ISD::InputArg> &Ins, 1348 DebugLoc dl, SelectionDAG &DAG, 1349 SmallVectorImpl<SDValue> &InVals) const { 1350 1351 // Assign locations to each value returned by this call. 1352 SmallVector<CCValAssign, 16> RVLocs; 1353 bool Is64Bit = Subtarget->is64Bit(); 1354 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1355 RVLocs, *DAG.getContext()); 1356 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1357 1358 // Copy all of the result registers out of their specified physreg. 1359 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1360 CCValAssign &VA = RVLocs[i]; 1361 EVT CopyVT = VA.getValVT(); 1362 1363 // If this is x86-64, and we disabled SSE, we can't return FP values 1364 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1365 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) { 1366 report_fatal_error("SSE register return with SSE disabled"); 1367 } 1368 1369 SDValue Val; 1370 1371 // If this is a call to a function that returns an fp value on the floating 1372 // point stack, we must guarantee the the value is popped from the stack, so 1373 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1374 // if the return value is not used. We use the FpGET_ST0 instructions 1375 // instead. 1376 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1377 // If we prefer to use the value in xmm registers, copy it out as f80 and 1378 // use a truncate to move it from fp stack reg to xmm reg. 1379 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1380 bool isST0 = VA.getLocReg() == X86::ST0; 1381 unsigned Opc = 0; 1382 if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32; 1383 if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64; 1384 if (CopyVT == MVT::f80) Opc = isST0 ? 
X86::FpGET_ST0_80:X86::FpGET_ST1_80; 1385 SDValue Ops[] = { Chain, InFlag }; 1386 Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Glue, 1387 Ops, 2), 1); 1388 Val = Chain.getValue(0); 1389 1390 // Round the f80 to the right size, which also moves it to the appropriate 1391 // xmm register. 1392 if (CopyVT != VA.getValVT()) 1393 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1394 // This truncation won't change the value. 1395 DAG.getIntPtrConstant(1)); 1396 } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1397 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1398 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1399 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1400 MVT::v2i64, InFlag).getValue(1); 1401 Val = Chain.getValue(0); 1402 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1403 Val, DAG.getConstant(0, MVT::i64)); 1404 } else { 1405 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1406 MVT::i64, InFlag).getValue(1); 1407 Val = Chain.getValue(0); 1408 } 1409 Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val); 1410 } else { 1411 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1412 CopyVT, InFlag).getValue(1); 1413 Val = Chain.getValue(0); 1414 } 1415 InFlag = Chain.getValue(2); 1416 InVals.push_back(Val); 1417 } 1418 1419 return Chain; 1420} 1421 1422 1423//===----------------------------------------------------------------------===// 1424// C & StdCall & Fast Calling Convention implementation 1425//===----------------------------------------------------------------------===// 1426// StdCall calling convention seems to be standard for many Windows' API 1427// routines and around. It differs from C calling convention just a little: 1428// callee should clean up the stack, not caller. Symbols should be also 1429// decorated in some fancy way :) It doesn't support any vector arguments. 1430// For info on fast calling convention see Fast Calling Convention (tail call) 1431// implementation LowerX86_32FastCCCallTo. 1432 1433/// CallIsStructReturn - Determines whether a call uses struct return 1434/// semantics. 1435static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1436 if (Outs.empty()) 1437 return false; 1438 1439 return Outs[0].Flags.isSRet(); 1440} 1441 1442/// ArgsAreStructReturn - Determines whether a function uses struct 1443/// return semantics. 1444static bool 1445ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1446 if (Ins.empty()) 1447 return false; 1448 1449 return Ins[0].Flags.isSRet(); 1450} 1451 1452/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1453/// by "Src" to address "Dst" with size and alignment information specified by 1454/// the specific parameter attribute. The copy will be passed as a byval 1455/// function parameter. 1456static SDValue 1457CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1458 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1459 DebugLoc dl) { 1460 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1461 1462 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1463 /*isVolatile*/false, /*AlwaysInline=*/true, 1464 MachinePointerInfo(), MachinePointerInfo()); 1465} 1466 1467/// IsTailCallConvention - Return true if the calling convention is one that 1468/// supports tail call optimization. 
1469static bool IsTailCallConvention(CallingConv::ID CC) { 1470 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1471} 1472 1473/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1474/// a tailcall target by changing its ABI. 1475static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1476 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1477} 1478 1479SDValue 1480X86TargetLowering::LowerMemArgument(SDValue Chain, 1481 CallingConv::ID CallConv, 1482 const SmallVectorImpl<ISD::InputArg> &Ins, 1483 DebugLoc dl, SelectionDAG &DAG, 1484 const CCValAssign &VA, 1485 MachineFrameInfo *MFI, 1486 unsigned i) const { 1487 // Create the nodes corresponding to a load from this parameter slot. 1488 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1489 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1490 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1491 EVT ValVT; 1492 1493 // If value is passed by pointer we have address passed instead of the value 1494 // itself. 1495 if (VA.getLocInfo() == CCValAssign::Indirect) 1496 ValVT = VA.getLocVT(); 1497 else 1498 ValVT = VA.getValVT(); 1499 1500 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1501 // changed with more analysis. 1502 // In case of tail call optimization mark all arguments mutable. Since they 1503 // could be overwritten by lowering of arguments in case of a tail call. 1504 if (Flags.isByVal()) { 1505 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1506 VA.getLocMemOffset(), isImmutable); 1507 return DAG.getFrameIndex(FI, getPointerTy()); 1508 } else { 1509 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1510 VA.getLocMemOffset(), isImmutable); 1511 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1512 return DAG.getLoad(ValVT, dl, Chain, FIN, 1513 MachinePointerInfo::getFixedStack(FI), 1514 false, false, 0); 1515 } 1516} 1517 1518SDValue 1519X86TargetLowering::LowerFormalArguments(SDValue Chain, 1520 CallingConv::ID CallConv, 1521 bool isVarArg, 1522 const SmallVectorImpl<ISD::InputArg> &Ins, 1523 DebugLoc dl, 1524 SelectionDAG &DAG, 1525 SmallVectorImpl<SDValue> &InVals) 1526 const { 1527 MachineFunction &MF = DAG.getMachineFunction(); 1528 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1529 1530 const Function* Fn = MF.getFunction(); 1531 if (Fn->hasExternalLinkage() && 1532 Subtarget->isTargetCygMing() && 1533 Fn->getName() == "main") 1534 FuncInfo->setForceFramePointer(true); 1535 1536 MachineFrameInfo *MFI = MF.getFrameInfo(); 1537 bool Is64Bit = Subtarget->is64Bit(); 1538 bool IsWin64 = Subtarget->isTargetWin64(); 1539 1540 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1541 "Var args not supported with calling convention fastcc or ghc"); 1542 1543 // Assign locations to all of the incoming arguments. 1544 SmallVector<CCValAssign, 16> ArgLocs; 1545 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1546 ArgLocs, *DAG.getContext()); 1547 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1548 1549 unsigned LastVal = ~0U; 1550 SDValue ArgValue; 1551 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1552 CCValAssign &VA = ArgLocs[i]; 1553 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1554 // places. 
1555 assert(VA.getValNo() != LastVal && 1556 "Don't support value assigned to multiple locs yet"); 1557 LastVal = VA.getValNo(); 1558 1559 if (VA.isRegLoc()) { 1560 EVT RegVT = VA.getLocVT(); 1561 TargetRegisterClass *RC = NULL; 1562 if (RegVT == MVT::i32) 1563 RC = X86::GR32RegisterClass; 1564 else if (Is64Bit && RegVT == MVT::i64) 1565 RC = X86::GR64RegisterClass; 1566 else if (RegVT == MVT::f32) 1567 RC = X86::FR32RegisterClass; 1568 else if (RegVT == MVT::f64) 1569 RC = X86::FR64RegisterClass; 1570 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1571 RC = X86::VR256RegisterClass; 1572 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1573 RC = X86::VR128RegisterClass; 1574 else if (RegVT == MVT::x86mmx) 1575 RC = X86::VR64RegisterClass; 1576 else 1577 llvm_unreachable("Unknown argument type!"); 1578 1579 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1580 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1581 1582 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1583 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1584 // right size. 1585 if (VA.getLocInfo() == CCValAssign::SExt) 1586 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1587 DAG.getValueType(VA.getValVT())); 1588 else if (VA.getLocInfo() == CCValAssign::ZExt) 1589 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1590 DAG.getValueType(VA.getValVT())); 1591 else if (VA.getLocInfo() == CCValAssign::BCvt) 1592 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1593 1594 if (VA.isExtInLoc()) { 1595 // Handle MMX values passed in XMM regs. 1596 if (RegVT.isVector()) { 1597 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1598 ArgValue); 1599 } else 1600 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1601 } 1602 } else { 1603 assert(VA.isMemLoc()); 1604 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1605 } 1606 1607 // If value is passed via pointer - do a load. 1608 if (VA.getLocInfo() == CCValAssign::Indirect) 1609 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1610 MachinePointerInfo(), false, false, 0); 1611 1612 InVals.push_back(ArgValue); 1613 } 1614 1615 // The x86-64 ABI for returning structs by value requires that we copy 1616 // the sret argument into %rax for the return. Save the argument into 1617 // a virtual register so that we can access it from the return points. 1618 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1619 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1620 unsigned Reg = FuncInfo->getSRetReturnReg(); 1621 if (!Reg) { 1622 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1623 FuncInfo->setSRetReturnReg(Reg); 1624 } 1625 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1626 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1627 } 1628 1629 unsigned StackSize = CCInfo.getNextStackOffset(); 1630 // Align stack specially for tail calls. 1631 if (FuncIsMadeTailCallSafe(CallConv)) 1632 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1633 1634 // If the function takes variable number of arguments, make a frame index for 1635 // the start of the first vararg value... for expansion of llvm.va_start. 
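// On x86-64 (SysV) this means building the va_list register save area: six
// GPRs (48 bytes) followed by eight XMM registers (16 bytes each), with the
// gp_offset/fp_offset values recorded in X86MachineFunctionInfo. On Win64 the
// caller-allocated GPR home slots are reused instead, as the code below shows.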
1636 if (isVarArg) { 1637 if (!IsWin64 && (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1638 CallConv != CallingConv::X86_ThisCall))) { 1639 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1640 } 1641 if (Is64Bit) { 1642 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1643 1644 // FIXME: We should really autogenerate these arrays 1645 static const unsigned GPR64ArgRegsWin64[] = { 1646 X86::RCX, X86::RDX, X86::R8, X86::R9 1647 }; 1648 static const unsigned GPR64ArgRegs64Bit[] = { 1649 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1650 }; 1651 static const unsigned XMMArgRegs64Bit[] = { 1652 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1653 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1654 }; 1655 const unsigned *GPR64ArgRegs; 1656 unsigned NumXMMRegs = 0; 1657 1658 if (IsWin64) { 1659 // The XMM registers which might contain var arg parameters are shadowed 1660 // in their paired GPR. So we only need to save the GPRs to their home 1661 // slots. 1662 TotalNumIntRegs = 4; 1663 GPR64ArgRegs = GPR64ArgRegsWin64; 1664 } else { 1665 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1666 GPR64ArgRegs = GPR64ArgRegs64Bit; 1667 1668 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); 1669 } 1670 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1671 TotalNumIntRegs); 1672 1673 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1674 assert(!(NumXMMRegs && !Subtarget->hasXMM()) && 1675 "SSE register cannot be used when SSE is disabled!"); 1676 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1677 "SSE register cannot be used when SSE is disabled!"); 1678 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) 1679 // Kernel mode asks for SSE to be disabled, so don't push them 1680 // on the stack. 1681 TotalNumXMMRegs = 0; 1682 1683 if (IsWin64) { 1684 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); 1685 // Get to the caller-allocated home save location. Add 8 to account 1686 // for the return address. 1687 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 1688 FuncInfo->setRegSaveFrameIndex( 1689 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1690 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1691 } else { 1692 // For X86-64, if there are vararg parameters that are passed via 1693 // registers, then we must store them to their spots on the stack so they 1694 // may be loaded by dereferencing the result of va_next. 1695 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1696 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1697 FuncInfo->setRegSaveFrameIndex( 1698 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1699 false)); 1700 } 1701 1702 // Store the integer parameter registers.
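// Each integer argument register that was not used for a fixed argument is
// copied into a virtual register and spilled to its 8-byte slot in the
// register save area.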
1703 SmallVector<SDValue, 8> MemOps; 1704 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1705 getPointerTy()); 1706 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1707 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1708 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1709 DAG.getIntPtrConstant(Offset)); 1710 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1711 X86::GR64RegisterClass); 1712 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1713 SDValue Store = 1714 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1715 MachinePointerInfo::getFixedStack( 1716 FuncInfo->getRegSaveFrameIndex(), Offset), 1717 false, false, 0); 1718 MemOps.push_back(Store); 1719 Offset += 8; 1720 } 1721 1722 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1723 // Now store the XMM (fp + vector) parameter registers. 1724 SmallVector<SDValue, 11> SaveXMMOps; 1725 SaveXMMOps.push_back(Chain); 1726 1727 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1728 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1729 SaveXMMOps.push_back(ALVal); 1730 1731 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1732 FuncInfo->getRegSaveFrameIndex())); 1733 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1734 FuncInfo->getVarArgsFPOffset())); 1735 1736 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1737 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 1738 X86::VR128RegisterClass); 1739 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1740 SaveXMMOps.push_back(Val); 1741 } 1742 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1743 MVT::Other, 1744 &SaveXMMOps[0], SaveXMMOps.size())); 1745 } 1746 1747 if (!MemOps.empty()) 1748 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1749 &MemOps[0], MemOps.size()); 1750 } 1751 } 1752 1753 // Some CCs need callee pop. 1754 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1755 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1756 } else { 1757 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1758 // If this is an sret function, the return should pop the hidden pointer. 1759 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1760 FuncInfo->setBytesToPopOnReturn(4); 1761 } 1762 1763 if (!Is64Bit) { 1764 // RegSaveFrameIndex is X86-64 only. 1765 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1766 if (CallConv == CallingConv::X86_FastCall || 1767 CallConv == CallingConv::X86_ThisCall) 1768 // fastcc functions can't have varargs. 1769 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1770 } 1771 1772 return Chain; 1773} 1774 1775SDValue 1776X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1777 SDValue StackPtr, SDValue Arg, 1778 DebugLoc dl, SelectionDAG &DAG, 1779 const CCValAssign &VA, 1780 ISD::ArgFlagsTy Flags) const { 1781 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1782 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1783 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1784 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1785 if (Flags.isByVal()) 1786 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1787 1788 return DAG.getStore(Chain, dl, Arg, PtrOff, 1789 MachinePointerInfo::getStack(LocMemOffset), 1790 false, false, 0); 1791} 1792 1793/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1794/// optimization is performed and it is required. 
1795 SDValue 1796 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1797 SDValue &OutRetAddr, SDValue Chain, 1798 bool IsTailCall, bool Is64Bit, 1799 int FPDiff, DebugLoc dl) const { 1800 // Adjust the Return address stack slot. 1801 EVT VT = getPointerTy(); 1802 OutRetAddr = getReturnAddressFrameIndex(DAG); 1803 1804 // Load the "old" Return address. 1805 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 1806 false, false, 0); 1807 return SDValue(OutRetAddr.getNode(), 1); 1808 } 1809 1810 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 1811 /// optimization is performed and it is required (FPDiff!=0). 1812 static SDValue 1813 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1814 SDValue Chain, SDValue RetAddrFrIdx, 1815 bool Is64Bit, int FPDiff, DebugLoc dl) { 1816 // Store the return address to the appropriate stack slot. 1817 if (!FPDiff) return Chain; 1818 // Calculate the new stack slot for the return address. 1819 int SlotSize = Is64Bit ? 8 : 4; 1820 int NewReturnAddrFI = 1821 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1822 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1823 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1824 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1825 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 1826 false, false, 0); 1827 return Chain; 1828 } 1829 1830 SDValue 1831 X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1832 CallingConv::ID CallConv, bool isVarArg, 1833 bool &isTailCall, 1834 const SmallVectorImpl<ISD::OutputArg> &Outs, 1835 const SmallVectorImpl<SDValue> &OutVals, 1836 const SmallVectorImpl<ISD::InputArg> &Ins, 1837 DebugLoc dl, SelectionDAG &DAG, 1838 SmallVectorImpl<SDValue> &InVals) const { 1839 MachineFunction &MF = DAG.getMachineFunction(); 1840 bool Is64Bit = Subtarget->is64Bit(); 1841 bool IsStructRet = CallIsStructReturn(Outs); 1842 bool IsSibcall = false; 1843 1844 if (isTailCall) { 1845 // Check if it's really possible to do a tail call. 1846 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1847 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1848 Outs, OutVals, Ins, DAG); 1849 1850 // Sibcalls are automatically detected tailcalls which do not require 1851 // ABI changes. 1852 if (!GuaranteedTailCallOpt && isTailCall) 1853 IsSibcall = true; 1854 1855 if (isTailCall) 1856 ++NumTailCalls; 1857 } 1858 1859 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1860 "Var args not supported with calling convention fastcc or ghc"); 1861 1862 // Analyze operands of the call, assigning locations to each operand. 1863 SmallVector<CCValAssign, 16> ArgLocs; 1864 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1865 ArgLocs, *DAG.getContext()); 1866 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 1867 1868 // Get a count of how many bytes are to be pushed on the stack. 1869 unsigned NumBytes = CCInfo.getNextStackOffset(); 1870 if (IsSibcall) 1871 // This is a sibcall. The memory operands are already in place in the 1872 // caller's own incoming argument area. 1873 NumBytes = 0; 1874 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1875 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1876 1877 int FPDiff = 0; 1878 if (isTailCall && !IsSibcall) { 1879 // Lower arguments at fp - stackoffset + fpdiff.
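// FPDiff is the size of the incoming argument area (the bytes this function
// pops on return) minus the bytes needed for this call's outgoing arguments.
// A negative FPDiff means the callee needs more argument space than the
// caller provides, so the return address slot has to be moved (see
// EmitTailCallStoreRetAddr above).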
1880 unsigned NumBytesCallerPushed = 1881 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1882 FPDiff = NumBytesCallerPushed - NumBytes; 1883 1884 // Set the delta of movement of the return address stack slot. 1885 // But only update it if the new delta is smaller than the previous one. 1886 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1887 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1888 } 1889 1890 if (!IsSibcall) 1891 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1892 1893 SDValue RetAddrFrIdx; 1894 // Load return address for tail calls. 1895 if (isTailCall && FPDiff) 1896 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 1897 Is64Bit, FPDiff, dl); 1898 1899 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1900 SmallVector<SDValue, 8> MemOpChains; 1901 SDValue StackPtr; 1902 1903 // Walk the register/memloc assignments, inserting copies/loads. In the case 1904 // of tail call optimization arguments are handled later. 1905 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1906 CCValAssign &VA = ArgLocs[i]; 1907 EVT RegVT = VA.getLocVT(); 1908 SDValue Arg = OutVals[i]; 1909 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1910 bool isByVal = Flags.isByVal(); 1911 1912 // Promote the value if needed. 1913 switch (VA.getLocInfo()) { 1914 default: llvm_unreachable("Unknown loc info!"); 1915 case CCValAssign::Full: break; 1916 case CCValAssign::SExt: 1917 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1918 break; 1919 case CCValAssign::ZExt: 1920 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1921 break; 1922 case CCValAssign::AExt: 1923 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1924 // Special case: passing MMX values in XMM registers. 1925 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 1926 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1927 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1928 } else 1929 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1930 break; 1931 case CCValAssign::BCvt: 1932 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 1933 break; 1934 case CCValAssign::Indirect: { 1935 // Store the argument. 1936 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1937 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1938 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1939 MachinePointerInfo::getFixedStack(FI), 1940 false, false, 0); 1941 Arg = SpillSlot; 1942 break; 1943 } 1944 } 1945 1946 if (VA.isRegLoc()) { 1947 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1948 if (isVarArg && Subtarget->isTargetWin64()) { 1949 // Win64 ABI requires argument XMM reg to be copied to the corresponding 1950 // shadow reg if callee is a varargs function.
1951 unsigned ShadowReg = 0; 1952 switch (VA.getLocReg()) { 1953 case X86::XMM0: ShadowReg = X86::RCX; break; 1954 case X86::XMM1: ShadowReg = X86::RDX; break; 1955 case X86::XMM2: ShadowReg = X86::R8; break; 1956 case X86::XMM3: ShadowReg = X86::R9; break; 1957 } 1958 if (ShadowReg) 1959 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 1960 } 1961 } else if (!IsSibcall && (!isTailCall || isByVal)) { 1962 assert(VA.isMemLoc()); 1963 if (StackPtr.getNode() == 0) 1964 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1965 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1966 dl, DAG, VA, Flags)); 1967 } 1968 } 1969 1970 if (!MemOpChains.empty()) 1971 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1972 &MemOpChains[0], MemOpChains.size()); 1973 1974 // Build a sequence of copy-to-reg nodes chained together with token chain 1975 // and flag operands which copy the outgoing args into registers. 1976 SDValue InFlag; 1977 // Tail call byval lowering might overwrite argument registers so in case of 1978 // tail call optimization the copies to registers are lowered later. 1979 if (!isTailCall) 1980 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1981 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1982 RegsToPass[i].second, InFlag); 1983 InFlag = Chain.getValue(1); 1984 } 1985 1986 if (Subtarget->isPICStyleGOT()) { 1987 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1988 // GOT pointer. 1989 if (!isTailCall) { 1990 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1991 DAG.getNode(X86ISD::GlobalBaseReg, 1992 DebugLoc(), getPointerTy()), 1993 InFlag); 1994 InFlag = Chain.getValue(1); 1995 } else { 1996 // If we are tail calling and generating PIC/GOT style code, load the 1997 // address of the callee into ECX. The value in ecx is used as target of 1998 // the tail jump. This is done to circumvent the ebx/callee-saved problem 1999 // for tail calls on PIC/GOT architectures. Normally we would just put the 2000 // address of GOT into ebx and then call target@PLT. But for tail calls 2001 // ebx would be restored (since ebx is callee saved) before jumping to the 2002 // target@PLT. 2003 2004 // Note: The actual moving to ECX is done further down. 2005 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2006 if (G && !G->getGlobal()->hasHiddenVisibility() && 2007 !G->getGlobal()->hasProtectedVisibility()) 2008 Callee = LowerGlobalAddress(Callee, DAG); 2009 else if (isa<ExternalSymbolSDNode>(Callee)) 2010 Callee = LowerExternalSymbol(Callee, DAG); 2011 } 2012 } 2013 2014 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) { 2015 // From AMD64 ABI document: 2016 // For calls that may call functions that use varargs or stdargs 2017 // (prototype-less calls or calls to functions containing ellipsis (...) in 2018 // the declaration) %al is used as a hidden argument to specify the number 2019 // of SSE registers used. The contents of %al do not need to match exactly 2020 // the number of registers, but must be an upper bound on the number of SSE 2021 // registers used and is in the range 0 - 8 inclusive. 2022 2023 // Count the number of XMM registers allocated.
2024 static const unsigned XMMArgRegs[] = { 2025 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2026 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2027 }; 2028 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2029 assert((Subtarget->hasXMM() || !NumXMMRegs) 2030 && "SSE registers cannot be used when SSE is disabled"); 2031 2032 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2033 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2034 InFlag = Chain.getValue(1); 2035 } 2036 2037 2038 // For tail calls lower the arguments to the 'real' stack slot. 2039 if (isTailCall) { 2040 // Force all the incoming stack arguments to be loaded from the stack 2041 // before any new outgoing arguments are stored to the stack, because the 2042 // outgoing stack slots may alias the incoming argument stack slots, and 2043 // the alias isn't otherwise explicit. This is slightly more conservative 2044 // than necessary, because it means that each store effectively depends 2045 // on every argument instead of just those arguments it would clobber. 2046 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2047 2048 SmallVector<SDValue, 8> MemOpChains2; 2049 SDValue FIN; 2050 int FI = 0; 2051 // Do not flag preceding copytoreg stuff together with the following stuff. 2052 InFlag = SDValue(); 2053 if (GuaranteedTailCallOpt) { 2054 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2055 CCValAssign &VA = ArgLocs[i]; 2056 if (VA.isRegLoc()) 2057 continue; 2058 assert(VA.isMemLoc()); 2059 SDValue Arg = OutVals[i]; 2060 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2061 // Create frame index. 2062 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2063 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2064 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2065 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2066 2067 if (Flags.isByVal()) { 2068 // Copy relative to framepointer. 2069 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2070 if (StackPtr.getNode() == 0) 2071 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2072 getPointerTy()); 2073 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2074 2075 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2076 ArgChain, 2077 Flags, DAG, dl)); 2078 } else { 2079 // Store relative to framepointer. 2080 MemOpChains2.push_back( 2081 DAG.getStore(ArgChain, dl, Arg, FIN, 2082 MachinePointerInfo::getFixedStack(FI), 2083 false, false, 0)); 2084 } 2085 } 2086 } 2087 2088 if (!MemOpChains2.empty()) 2089 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2090 &MemOpChains2[0], MemOpChains2.size()); 2091 2092 // Copy arguments to their registers. 2093 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2094 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2095 RegsToPass[i].second, InFlag); 2096 InFlag = Chain.getValue(1); 2097 } 2098 InFlag = SDValue(); 2099 2100 // Store the return address to the appropriate stack slot. 2101 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2102 FPDiff, dl); 2103 } 2104 2105 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2106 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2107 // In the 64-bit large code model, we have to make all calls 2108 // through a register, since the call instruction's 32-bit 2109 // pc-relative offset may not be large enough to hold the whole 2110 // address.
2111 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2112 // If the callee is a GlobalAddress node (quite common, every direct call 2113 // is), turn it into a TargetGlobalAddress node so that legalize doesn't hack 2114 // it. 2115 2116 // We should use an extra load for direct calls to dllimported functions in 2117 // non-JIT mode. 2118 const GlobalValue *GV = G->getGlobal(); 2119 if (!GV->hasDLLImportLinkage()) { 2120 unsigned char OpFlags = 0; 2121 2122 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2123 // external symbols must go through the PLT in PIC mode. If the symbol 2124 // has hidden or protected visibility, or if it is static or local, then 2125 // we don't need to use the PLT - we can directly call it. 2126 if (Subtarget->isTargetELF() && 2127 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2128 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2129 OpFlags = X86II::MO_PLT; 2130 } else if (Subtarget->isPICStyleStubAny() && 2131 (GV->isDeclaration() || GV->isWeakForLinker()) && 2132 Subtarget->getDarwinVers() < 9) { 2133 // PC-relative references to external symbols should go through $stub, 2134 // unless we're building with the leopard linker or later, which 2135 // automatically synthesizes these stubs. 2136 OpFlags = X86II::MO_DARWIN_STUB; 2137 } 2138 2139 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2140 G->getOffset(), OpFlags); 2141 } 2142 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2143 unsigned char OpFlags = 0; 2144 2145 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2146 // external symbols should go through the PLT. 2147 if (Subtarget->isTargetELF() && 2148 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2149 OpFlags = X86II::MO_PLT; 2150 } else if (Subtarget->isPICStyleStubAny() && 2151 Subtarget->getDarwinVers() < 9) { 2152 // PC-relative references to external symbols should go through $stub, 2153 // unless we're building with the leopard linker or later, which 2154 // automatically synthesizes these stubs. 2155 OpFlags = X86II::MO_DARWIN_STUB; 2156 } 2157 2158 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2159 OpFlags); 2160 } 2161 2162 // Returns a chain & a flag for retval copy to use. 2163 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2164 SmallVector<SDValue, 8> Ops; 2165 2166 if (!IsSibcall && isTailCall) { 2167 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2168 DAG.getIntPtrConstant(0, true), InFlag); 2169 InFlag = Chain.getValue(1); 2170 } 2171 2172 Ops.push_back(Chain); 2173 Ops.push_back(Callee); 2174 2175 if (isTailCall) 2176 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2177 2178 // Add argument registers to the end of the list so that they are known live 2179 // into the call. 2180 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2181 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2182 RegsToPass[i].second.getValueType())); 2183 2184 // Add an implicit use of the GOT pointer in EBX. 2185 if (!isTailCall && Subtarget->isPICStyleGOT()) 2186 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2187 2188 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
2189 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) 2190 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2191 2192 if (InFlag.getNode()) 2193 Ops.push_back(InFlag); 2194 2195 if (isTailCall) { 2196 // We used to do: 2197 //// If this is the first return lowered for this function, add the regs 2198 //// to the liveout set for the function. 2199 // This isn't right, although it's probably harmless on x86; liveouts 2200 // should be computed from returns not tail calls. Consider a void 2201 // function making a tail call to a function returning int. 2202 return DAG.getNode(X86ISD::TC_RETURN, dl, 2203 NodeTys, &Ops[0], Ops.size()); 2204 } 2205 2206 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2207 InFlag = Chain.getValue(1); 2208 2209 // Create the CALLSEQ_END node. 2210 unsigned NumBytesForCalleeToPush; 2211 if (Subtarget->IsCalleePop(isVarArg, CallConv)) 2212 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2213 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2214 // If this is a call to a struct-return function, the callee 2215 // pops the hidden struct pointer, so we have to push it back. 2216 // This is common for Darwin/X86, Linux & Mingw32 targets. 2217 NumBytesForCalleeToPush = 4; 2218 else 2219 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2220 2221 // Returns a flag for retval copy to use. 2222 if (!IsSibcall) { 2223 Chain = DAG.getCALLSEQ_END(Chain, 2224 DAG.getIntPtrConstant(NumBytes, true), 2225 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2226 true), 2227 InFlag); 2228 InFlag = Chain.getValue(1); 2229 } 2230 2231 // Handle result values, copying them out of physregs into vregs that we 2232 // return. 2233 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2234 Ins, dl, DAG, InVals); 2235 } 2236 2237 2238 //===----------------------------------------------------------------------===// 2239 // Fast Calling Convention (tail call) implementation 2240 //===----------------------------------------------------------------------===// 2241 2242 // Like stdcall, the callee cleans up the arguments; the convention differs in that ECX is 2243 // reserved for storing the tail called function address. Only 2 registers are 2244 // free for argument passing (inreg). Tail call optimization is performed 2245 // provided: 2246 // * tailcallopt is enabled 2247 // * caller/callee are fastcc 2248 // On X86_64 architecture with GOT-style position independent code only local 2249 // (within module) calls are supported at the moment. 2250 // To keep the stack aligned according to the platform ABI, the function 2251 // GetAlignedArgumentStackSize rounds the outgoing argument size up so that 2252 // the stack stays aligned across the call. (Dynamic linkers need this - darwin's dyld for example) 2253 // If the callee of a tail call has more arguments than the caller, the 2254 // caller needs to make sure that there is room to move the RETADDR to. This is 2255 // achieved by reserving an area the size of the argument delta right after the 2256 // original RETADDR, but before the saved framepointer or the spilled registers 2257 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4) 2258 // stack layout: 2259 // arg1 2260 // arg2 2261 // RETADDR 2262 // [ new RETADDR 2263 // move area ] 2264 // (possible EBP) 2265 // ESI 2266 // EDI 2267 // local1 .. 2268 2269 /// GetAlignedArgumentStackSize - Round the stack size up so that it is, e.g., 2270 /// 16n + 12 for a 16 byte alignment requirement.
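/// For example, with a 16 byte stack alignment and a 4 byte slot size, a
/// StackSize of 22 is rounded up to 28 (16*1 + 12), so that the argument area
/// plus the pushed return address remains a multiple of the stack alignment.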
2271unsigned 2272X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2273 SelectionDAG& DAG) const { 2274 MachineFunction &MF = DAG.getMachineFunction(); 2275 const TargetMachine &TM = MF.getTarget(); 2276 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 2277 unsigned StackAlignment = TFI.getStackAlignment(); 2278 uint64_t AlignMask = StackAlignment - 1; 2279 int64_t Offset = StackSize; 2280 uint64_t SlotSize = TD->getPointerSize(); 2281 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2282 // Number smaller than 12 so just add the difference. 2283 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2284 } else { 2285 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2286 Offset = ((~AlignMask) & Offset) + StackAlignment + 2287 (StackAlignment-SlotSize); 2288 } 2289 return Offset; 2290} 2291 2292/// MatchingStackOffset - Return true if the given stack call argument is 2293/// already available in the same position (relatively) of the caller's 2294/// incoming argument stack. 2295static 2296bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2297 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2298 const X86InstrInfo *TII) { 2299 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2300 int FI = INT_MAX; 2301 if (Arg.getOpcode() == ISD::CopyFromReg) { 2302 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2303 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2304 return false; 2305 MachineInstr *Def = MRI->getVRegDef(VR); 2306 if (!Def) 2307 return false; 2308 if (!Flags.isByVal()) { 2309 if (!TII->isLoadFromStackSlot(Def, FI)) 2310 return false; 2311 } else { 2312 unsigned Opcode = Def->getOpcode(); 2313 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2314 Def->getOperand(1).isFI()) { 2315 FI = Def->getOperand(1).getIndex(); 2316 Bytes = Flags.getByValSize(); 2317 } else 2318 return false; 2319 } 2320 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2321 if (Flags.isByVal()) 2322 // ByVal argument is passed in as a pointer but it's now being 2323 // dereferenced. e.g. 2324 // define @foo(%struct.X* %A) { 2325 // tail call @bar(%struct.X* byval %A) 2326 // } 2327 return false; 2328 SDValue Ptr = Ld->getBasePtr(); 2329 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2330 if (!FINode) 2331 return false; 2332 FI = FINode->getIndex(); 2333 } else 2334 return false; 2335 2336 assert(FI != INT_MAX); 2337 if (!MFI->isFixedObjectIndex(FI)) 2338 return false; 2339 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2340} 2341 2342/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2343/// for tail call optimization. Targets which want to do tail call 2344/// optimization should implement this function. 2345bool 2346X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2347 CallingConv::ID CalleeCC, 2348 bool isVarArg, 2349 bool isCalleeStructRet, 2350 bool isCallerStructRet, 2351 const SmallVectorImpl<ISD::OutputArg> &Outs, 2352 const SmallVectorImpl<SDValue> &OutVals, 2353 const SmallVectorImpl<ISD::InputArg> &Ins, 2354 SelectionDAG& DAG) const { 2355 if (!IsTailCallConvention(CalleeCC) && 2356 CalleeCC != CallingConv::C) 2357 return false; 2358 2359 // If -tailcallopt is specified, make fastcc functions tail-callable. 
2360 const MachineFunction &MF = DAG.getMachineFunction(); 2361 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2362 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2363 bool CCMatch = CallerCC == CalleeCC; 2364 2365 if (GuaranteedTailCallOpt) { 2366 if (IsTailCallConvention(CalleeCC) && CCMatch) 2367 return true; 2368 return false; 2369 } 2370 2371 // Look for obvious safe cases to perform tail call optimization that do not 2372 // require ABI changes. This is what gcc calls sibcall. 2373 2374 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2375 // emit a special epilogue. 2376 if (RegInfo->needsStackRealignment(MF)) 2377 return false; 2378 2379 // Do not sibcall optimize vararg calls unless the call site is not passing 2380 // any arguments. 2381 if (isVarArg && !Outs.empty()) 2382 return false; 2383 2384 // Also avoid sibcall optimization if either caller or callee uses struct 2385 // return semantics. 2386 if (isCalleeStructRet || isCallerStructRet) 2387 return false; 2388 2389 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2390 // Therefore if it's not used by the call it is not safe to optimize this into 2391 // a sibcall. 2392 bool Unused = false; 2393 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2394 if (!Ins[i].Used) { 2395 Unused = true; 2396 break; 2397 } 2398 } 2399 if (Unused) { 2400 SmallVector<CCValAssign, 16> RVLocs; 2401 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2402 RVLocs, *DAG.getContext()); 2403 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2404 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2405 CCValAssign &VA = RVLocs[i]; 2406 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2407 return false; 2408 } 2409 } 2410 2411 // If the calling conventions do not match, then we'd better make sure the 2412 // results are returned in the same way as what the caller expects. 2413 if (!CCMatch) { 2414 SmallVector<CCValAssign, 16> RVLocs1; 2415 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2416 RVLocs1, *DAG.getContext()); 2417 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2418 2419 SmallVector<CCValAssign, 16> RVLocs2; 2420 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2421 RVLocs2, *DAG.getContext()); 2422 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2423 2424 if (RVLocs1.size() != RVLocs2.size()) 2425 return false; 2426 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2427 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2428 return false; 2429 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2430 return false; 2431 if (RVLocs1[i].isRegLoc()) { 2432 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2433 return false; 2434 } else { 2435 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2436 return false; 2437 } 2438 } 2439 } 2440 2441 // If the callee takes no arguments then go on to check the results of the 2442 // call. 2443 if (!Outs.empty()) { 2444 // Check if stack adjustment is needed. For now, do not do this if any 2445 // argument is passed on the stack. 
2446 SmallVector<CCValAssign, 16> ArgLocs; 2447 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2448 ArgLocs, *DAG.getContext()); 2449 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2450 if (CCInfo.getNextStackOffset()) { 2451 MachineFunction &MF = DAG.getMachineFunction(); 2452 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2453 return false; 2454 2455 // Check if the arguments are already laid out in the right way as 2456 // the caller's fixed stack objects. 2457 MachineFrameInfo *MFI = MF.getFrameInfo(); 2458 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2459 const X86InstrInfo *TII = 2460 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2461 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2462 CCValAssign &VA = ArgLocs[i]; 2463 SDValue Arg = OutVals[i]; 2464 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2465 if (VA.getLocInfo() == CCValAssign::Indirect) 2466 return false; 2467 if (!VA.isRegLoc()) { 2468 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2469 MFI, MRI, TII)) 2470 return false; 2471 } 2472 } 2473 } 2474 2475 // If the tailcall address may be in a register, then make sure it's 2476 // possible to register allocate for it. In 32-bit, the call address can 2477 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2478 // callee-saved registers are restored. These happen to be the same 2479 // registers used to pass 'inreg' arguments so watch out for those. 2480 if (!Subtarget->is64Bit() && 2481 !isa<GlobalAddressSDNode>(Callee) && 2482 !isa<ExternalSymbolSDNode>(Callee)) { 2483 unsigned NumInRegs = 0; 2484 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2485 CCValAssign &VA = ArgLocs[i]; 2486 if (!VA.isRegLoc()) 2487 continue; 2488 unsigned Reg = VA.getLocReg(); 2489 switch (Reg) { 2490 default: break; 2491 case X86::EAX: case X86::EDX: case X86::ECX: 2492 if (++NumInRegs == 3) 2493 return false; 2494 break; 2495 } 2496 } 2497 } 2498 } 2499 2500 // An stdcall caller is expected to clean up its arguments; the callee 2501 // isn't going to do that. 
2502 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2503 return false; 2504 2505 return true; 2506} 2507 2508FastISel * 2509X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2510 return X86::createFastISel(funcInfo); 2511} 2512 2513 2514//===----------------------------------------------------------------------===// 2515// Other Lowering Hooks 2516//===----------------------------------------------------------------------===// 2517 2518static bool MayFoldLoad(SDValue Op) { 2519 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2520} 2521 2522static bool MayFoldIntoStore(SDValue Op) { 2523 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2524} 2525 2526static bool isTargetShuffle(unsigned Opcode) { 2527 switch(Opcode) { 2528 default: return false; 2529 case X86ISD::PSHUFD: 2530 case X86ISD::PSHUFHW: 2531 case X86ISD::PSHUFLW: 2532 case X86ISD::SHUFPD: 2533 case X86ISD::PALIGN: 2534 case X86ISD::SHUFPS: 2535 case X86ISD::MOVLHPS: 2536 case X86ISD::MOVLHPD: 2537 case X86ISD::MOVHLPS: 2538 case X86ISD::MOVLPS: 2539 case X86ISD::MOVLPD: 2540 case X86ISD::MOVSHDUP: 2541 case X86ISD::MOVSLDUP: 2542 case X86ISD::MOVDDUP: 2543 case X86ISD::MOVSS: 2544 case X86ISD::MOVSD: 2545 case X86ISD::UNPCKLPS: 2546 case X86ISD::UNPCKLPD: 2547 case X86ISD::PUNPCKLWD: 2548 case X86ISD::PUNPCKLBW: 2549 case X86ISD::PUNPCKLDQ: 2550 case X86ISD::PUNPCKLQDQ: 2551 case X86ISD::UNPCKHPS: 2552 case X86ISD::UNPCKHPD: 2553 case X86ISD::PUNPCKHWD: 2554 case X86ISD::PUNPCKHBW: 2555 case X86ISD::PUNPCKHDQ: 2556 case X86ISD::PUNPCKHQDQ: 2557 return true; 2558 } 2559 return false; 2560} 2561 2562static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2563 SDValue V1, SelectionDAG &DAG) { 2564 switch(Opc) { 2565 default: llvm_unreachable("Unknown x86 shuffle node"); 2566 case X86ISD::MOVSHDUP: 2567 case X86ISD::MOVSLDUP: 2568 case X86ISD::MOVDDUP: 2569 return DAG.getNode(Opc, dl, VT, V1); 2570 } 2571 2572 return SDValue(); 2573} 2574 2575static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2576 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2577 switch(Opc) { 2578 default: llvm_unreachable("Unknown x86 shuffle node"); 2579 case X86ISD::PSHUFD: 2580 case X86ISD::PSHUFHW: 2581 case X86ISD::PSHUFLW: 2582 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2583 } 2584 2585 return SDValue(); 2586} 2587 2588static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2589 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2590 switch(Opc) { 2591 default: llvm_unreachable("Unknown x86 shuffle node"); 2592 case X86ISD::PALIGN: 2593 case X86ISD::SHUFPD: 2594 case X86ISD::SHUFPS: 2595 return DAG.getNode(Opc, dl, VT, V1, V2, 2596 DAG.getConstant(TargetMask, MVT::i8)); 2597 } 2598 return SDValue(); 2599} 2600 2601static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2602 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2603 switch(Opc) { 2604 default: llvm_unreachable("Unknown x86 shuffle node"); 2605 case X86ISD::MOVLHPS: 2606 case X86ISD::MOVLHPD: 2607 case X86ISD::MOVHLPS: 2608 case X86ISD::MOVLPS: 2609 case X86ISD::MOVLPD: 2610 case X86ISD::MOVSS: 2611 case X86ISD::MOVSD: 2612 case X86ISD::UNPCKLPS: 2613 case X86ISD::UNPCKLPD: 2614 case X86ISD::PUNPCKLWD: 2615 case X86ISD::PUNPCKLBW: 2616 case X86ISD::PUNPCKLDQ: 2617 case X86ISD::PUNPCKLQDQ: 2618 case X86ISD::UNPCKHPS: 2619 case X86ISD::UNPCKHPD: 2620 case X86ISD::PUNPCKHWD: 2621 case X86ISD::PUNPCKHBW: 2622 case 
X86ISD::PUNPCKHDQ: 2623 case X86ISD::PUNPCKHQDQ: 2624 return DAG.getNode(Opc, dl, VT, V1, V2); 2625 } 2626 return SDValue(); 2627 } 2628 2629 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2630 MachineFunction &MF = DAG.getMachineFunction(); 2631 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2632 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2633 2634 if (ReturnAddrIndex == 0) { 2635 // Set up a frame object for the return address. 2636 uint64_t SlotSize = TD->getPointerSize(); 2637 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2638 false); 2639 FuncInfo->setRAIndex(ReturnAddrIndex); 2640 } 2641 2642 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2643 } 2644 2645 2646 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2647 bool hasSymbolicDisplacement) { 2648 // Offset should fit into a 32 bit immediate field. 2649 if (!isInt<32>(Offset)) 2650 return false; 2651 2652 // If we don't have a symbolic displacement - we don't have any extra 2653 // restrictions. 2654 if (!hasSymbolicDisplacement) 2655 return true; 2656 2657 // FIXME: Some tweaks might be needed for medium code model. 2658 if (M != CodeModel::Small && M != CodeModel::Kernel) 2659 return false; 2660 2661 // For the small code model we assume that the last object lies at least 16MB 2662 // below the 31-bit boundary. We may also accept pretty large negative constants knowing 2663 // that all objects are in the positive half of the address space. 2664 if (M == CodeModel::Small && Offset < 16*1024*1024) 2665 return true; 2666 2667 // For the kernel code model we know that all objects reside in the negative half 2668 // of the 32-bit address space. We may not accept negative offsets, since they may 2669 // be just off, but we may accept pretty large positive ones. 2670 if (M == CodeModel::Kernel && Offset > 0) 2671 return true; 2672 2673 return false; 2674 } 2675 2676 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86 2677 /// specific condition code, returning the condition code and the LHS/RHS of the 2678 /// comparison to make. 2679 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2680 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2681 if (!isFP) { 2682 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2683 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2684 // X > -1 -> X == 0, jump !sign. 2685 RHS = DAG.getConstant(0, RHS.getValueType()); 2686 return X86::COND_NS; 2687 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2688 // X < 0 -> X == 0, jump on sign. 2689 return X86::COND_S; 2690 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2691 // X < 1 -> X <= 0 2692 RHS = DAG.getConstant(0, RHS.getValueType()); 2693 return X86::COND_LE; 2694 } 2695 } 2696 2697 switch (SetCCOpcode) { 2698 default: llvm_unreachable("Invalid integer condition!"); 2699 case ISD::SETEQ: return X86::COND_E; 2700 case ISD::SETGT: return X86::COND_G; 2701 case ISD::SETGE: return X86::COND_GE; 2702 case ISD::SETLT: return X86::COND_L; 2703 case ISD::SETLE: return X86::COND_LE; 2704 case ISD::SETNE: return X86::COND_NE; 2705 case ISD::SETULT: return X86::COND_B; 2706 case ISD::SETUGT: return X86::COND_A; 2707 case ISD::SETULE: return X86::COND_BE; 2708 case ISD::SETUGE: return X86::COND_AE; 2709 } 2710 } 2711 2712 // First determine if it is required or is profitable to flip the operands. 2713 2714 // If LHS is a foldable load, but RHS is not, flip the condition.
2715 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2716 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2717 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2718 std::swap(LHS, RHS); 2719 } 2720 2721 switch (SetCCOpcode) { 2722 default: break; 2723 case ISD::SETOLT: 2724 case ISD::SETOLE: 2725 case ISD::SETUGT: 2726 case ISD::SETUGE: 2727 std::swap(LHS, RHS); 2728 break; 2729 } 2730 2731 // On a floating point condition, the flags are set as follows: 2732 // ZF PF CF op 2733 // 0 | 0 | 0 | X > Y 2734 // 0 | 0 | 1 | X < Y 2735 // 1 | 0 | 0 | X == Y 2736 // 1 | 1 | 1 | unordered 2737 switch (SetCCOpcode) { 2738 default: llvm_unreachable("Condcode should be pre-legalized away"); 2739 case ISD::SETUEQ: 2740 case ISD::SETEQ: return X86::COND_E; 2741 case ISD::SETOLT: // flipped 2742 case ISD::SETOGT: 2743 case ISD::SETGT: return X86::COND_A; 2744 case ISD::SETOLE: // flipped 2745 case ISD::SETOGE: 2746 case ISD::SETGE: return X86::COND_AE; 2747 case ISD::SETUGT: // flipped 2748 case ISD::SETULT: 2749 case ISD::SETLT: return X86::COND_B; 2750 case ISD::SETUGE: // flipped 2751 case ISD::SETULE: 2752 case ISD::SETLE: return X86::COND_BE; 2753 case ISD::SETONE: 2754 case ISD::SETNE: return X86::COND_NE; 2755 case ISD::SETUO: return X86::COND_P; 2756 case ISD::SETO: return X86::COND_NP; 2757 case ISD::SETOEQ: 2758 case ISD::SETUNE: return X86::COND_INVALID; 2759 } 2760 } 2761 2762 /// hasFPCMov - Is there a floating point cmov for the specific X86 condition 2763 /// code? The current x86 ISA includes the following FP cmov instructions: 2764 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2765 static bool hasFPCMov(unsigned X86CC) { 2766 switch (X86CC) { 2767 default: 2768 return false; 2769 case X86::COND_B: 2770 case X86::COND_BE: 2771 case X86::COND_E: 2772 case X86::COND_P: 2773 case X86::COND_A: 2774 case X86::COND_AE: 2775 case X86::COND_NE: 2776 case X86::COND_NP: 2777 return true; 2778 } 2779 } 2780 2781 /// isFPImmLegal - Returns true if the target can instruction select the 2782 /// specified FP immediate natively. If false, the legalizer will 2783 /// materialize the FP immediate as a load from a constant pool. 2784 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 2785 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 2786 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 2787 return true; 2788 } 2789 return false; 2790 } 2791 2792 /// isUndefOrInRange - Return true if Val is undef or if its value falls within 2793 /// the specified range [Low, Hi). 2794 static bool isUndefOrInRange(int Val, int Low, int Hi) { 2795 return (Val < 0) || (Val >= Low && Val < Hi); 2796 } 2797 2798 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2799 /// specified value. 2800 static bool isUndefOrEqual(int Val, int CmpVal) { 2801 if (Val < 0 || Val == CmpVal) 2802 return true; 2803 return false; 2804 } 2805 2806 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2807 /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2808 /// the second operand.
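/// For example, <2, 1, 0, 3> is a valid v4i32 PSHUFD mask, while <0, 4, 1, 5>
/// is not, because elements 4 and 5 would come from the second operand.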
2809static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2810 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 2811 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2812 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2813 return (Mask[0] < 2 && Mask[1] < 2); 2814 return false; 2815} 2816 2817bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2818 SmallVector<int, 8> M; 2819 N->getMask(M); 2820 return ::isPSHUFDMask(M, N->getValueType(0)); 2821} 2822 2823/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2824/// is suitable for input to PSHUFHW. 2825static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2826 if (VT != MVT::v8i16) 2827 return false; 2828 2829 // Lower quadword copied in order or undef. 2830 for (int i = 0; i != 4; ++i) 2831 if (Mask[i] >= 0 && Mask[i] != i) 2832 return false; 2833 2834 // Upper quadword shuffled. 2835 for (int i = 4; i != 8; ++i) 2836 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2837 return false; 2838 2839 return true; 2840} 2841 2842bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2843 SmallVector<int, 8> M; 2844 N->getMask(M); 2845 return ::isPSHUFHWMask(M, N->getValueType(0)); 2846} 2847 2848/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2849/// is suitable for input to PSHUFLW. 2850static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2851 if (VT != MVT::v8i16) 2852 return false; 2853 2854 // Upper quadword copied in order. 2855 for (int i = 4; i != 8; ++i) 2856 if (Mask[i] >= 0 && Mask[i] != i) 2857 return false; 2858 2859 // Lower quadword shuffled. 2860 for (int i = 0; i != 4; ++i) 2861 if (Mask[i] >= 4) 2862 return false; 2863 2864 return true; 2865} 2866 2867bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2868 SmallVector<int, 8> M; 2869 N->getMask(M); 2870 return ::isPSHUFLWMask(M, N->getValueType(0)); 2871} 2872 2873/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2874/// is suitable for input to PALIGNR. 2875static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2876 bool hasSSSE3) { 2877 int i, e = VT.getVectorNumElements(); 2878 2879 // Do not handle v2i64 / v2f64 shuffles with palignr. 2880 if (e < 4 || !hasSSSE3) 2881 return false; 2882 2883 for (i = 0; i != e; ++i) 2884 if (Mask[i] >= 0) 2885 break; 2886 2887 // All undef, not a palignr. 2888 if (i == e) 2889 return false; 2890 2891 // Determine if it's ok to perform a palignr with only the LHS, since we 2892 // don't have access to the actual shuffle elements to see if RHS is undef. 2893 bool Unary = Mask[i] < (int)e; 2894 bool NeedsUnary = false; 2895 2896 int s = Mask[i] - i; 2897 2898 // Check the rest of the elements to see if they are consecutive. 2899 for (++i; i != e; ++i) { 2900 int m = Mask[i]; 2901 if (m < 0) 2902 continue; 2903 2904 Unary = Unary && (m < (int)e); 2905 NeedsUnary = NeedsUnary || (m < s); 2906 2907 if (NeedsUnary && !Unary) 2908 return false; 2909 if (Unary && m != ((s+i) & (e-1))) 2910 return false; 2911 if (!Unary && m != (s+i)) 2912 return false; 2913 } 2914 return true; 2915} 2916 2917bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2918 SmallVector<int, 8> M; 2919 N->getMask(M); 2920 return ::isPALIGNRMask(M, N->getValueType(0), true); 2921} 2922 2923/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2924/// specifies a shuffle of elements that is suitable for input to SHUFP*. 
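/// For a 4-element type the first two mask elements must select from the first
/// vector (indices 0-3) and the last two from the second vector (indices 4-7),
/// e.g. <1, 3, 4, 6>; undef (negative) elements are allowed anywhere.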
2925static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2926 int NumElems = VT.getVectorNumElements(); 2927 if (NumElems != 2 && NumElems != 4) 2928 return false; 2929 2930 int Half = NumElems / 2; 2931 for (int i = 0; i < Half; ++i) 2932 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2933 return false; 2934 for (int i = Half; i < NumElems; ++i) 2935 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2936 return false; 2937 2938 return true; 2939} 2940 2941bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2942 SmallVector<int, 8> M; 2943 N->getMask(M); 2944 return ::isSHUFPMask(M, N->getValueType(0)); 2945} 2946 2947/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2948/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2949/// half elements to come from vector 1 (which would equal the dest.) and 2950/// the upper half to come from vector 2. 2951static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2952 int NumElems = VT.getVectorNumElements(); 2953 2954 if (NumElems != 2 && NumElems != 4) 2955 return false; 2956 2957 int Half = NumElems / 2; 2958 for (int i = 0; i < Half; ++i) 2959 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2960 return false; 2961 for (int i = Half; i < NumElems; ++i) 2962 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2963 return false; 2964 return true; 2965} 2966 2967static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2968 SmallVector<int, 8> M; 2969 N->getMask(M); 2970 return isCommutedSHUFPMask(M, N->getValueType(0)); 2971} 2972 2973/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2974/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2975bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2976 if (N->getValueType(0).getVectorNumElements() != 4) 2977 return false; 2978 2979 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2980 return isUndefOrEqual(N->getMaskElt(0), 6) && 2981 isUndefOrEqual(N->getMaskElt(1), 7) && 2982 isUndefOrEqual(N->getMaskElt(2), 2) && 2983 isUndefOrEqual(N->getMaskElt(3), 3); 2984} 2985 2986/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2987/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2988/// <2, 3, 2, 3> 2989bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2990 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2991 2992 if (NumElems != 4) 2993 return false; 2994 2995 return isUndefOrEqual(N->getMaskElt(0), 2) && 2996 isUndefOrEqual(N->getMaskElt(1), 3) && 2997 isUndefOrEqual(N->getMaskElt(2), 2) && 2998 isUndefOrEqual(N->getMaskElt(3), 3); 2999} 3000 3001/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3002/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3003bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3004 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3005 3006 if (NumElems != 2 && NumElems != 4) 3007 return false; 3008 3009 for (unsigned i = 0; i < NumElems/2; ++i) 3010 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3011 return false; 3012 3013 for (unsigned i = NumElems/2; i < NumElems; ++i) 3014 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3015 return false; 3016 3017 return true; 3018} 3019 3020/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3021/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 
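/// For v4f32 this corresponds to the mask <0,1,4,5>: the low half of the
/// result is the low half of V1 and the high half of the result is the low
/// half of V2 (undef entries allowed).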
3022bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3023 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3024 3025 if (NumElems != 2 && NumElems != 4) 3026 return false; 3027 3028 for (unsigned i = 0; i < NumElems/2; ++i) 3029 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3030 return false; 3031 3032 for (unsigned i = 0; i < NumElems/2; ++i) 3033 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3034 return false; 3035 3036 return true; 3037} 3038 3039/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3040/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3041static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3042 bool V2IsSplat = false) { 3043 int NumElts = VT.getVectorNumElements(); 3044 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3045 return false; 3046 3047 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3048 int BitI = Mask[i]; 3049 int BitI1 = Mask[i+1]; 3050 if (!isUndefOrEqual(BitI, j)) 3051 return false; 3052 if (V2IsSplat) { 3053 if (!isUndefOrEqual(BitI1, NumElts)) 3054 return false; 3055 } else { 3056 if (!isUndefOrEqual(BitI1, j + NumElts)) 3057 return false; 3058 } 3059 } 3060 return true; 3061} 3062 3063bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3064 SmallVector<int, 8> M; 3065 N->getMask(M); 3066 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3067} 3068 3069/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3070/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3071static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3072 bool V2IsSplat = false) { 3073 int NumElts = VT.getVectorNumElements(); 3074 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3075 return false; 3076 3077 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3078 int BitI = Mask[i]; 3079 int BitI1 = Mask[i+1]; 3080 if (!isUndefOrEqual(BitI, j + NumElts/2)) 3081 return false; 3082 if (V2IsSplat) { 3083 if (isUndefOrEqual(BitI1, NumElts)) 3084 return false; 3085 } else { 3086 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 3087 return false; 3088 } 3089 } 3090 return true; 3091} 3092 3093bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3094 SmallVector<int, 8> M; 3095 N->getMask(M); 3096 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3097} 3098 3099/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3100/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3101/// <0, 0, 1, 1> 3102static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3103 int NumElems = VT.getVectorNumElements(); 3104 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3105 return false; 3106 3107 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 3108 int BitI = Mask[i]; 3109 int BitI1 = Mask[i+1]; 3110 if (!isUndefOrEqual(BitI, j)) 3111 return false; 3112 if (!isUndefOrEqual(BitI1, j)) 3113 return false; 3114 } 3115 return true; 3116} 3117 3118bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3119 SmallVector<int, 8> M; 3120 N->getMask(M); 3121 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3122} 3123 3124/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3125/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. 
vector_shuffle v, undef, 3126/// <2, 2, 3, 3> 3127static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3128 int NumElems = VT.getVectorNumElements(); 3129 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3130 return false; 3131 3132 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3133 int BitI = Mask[i]; 3134 int BitI1 = Mask[i+1]; 3135 if (!isUndefOrEqual(BitI, j)) 3136 return false; 3137 if (!isUndefOrEqual(BitI1, j)) 3138 return false; 3139 } 3140 return true; 3141} 3142 3143bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3144 SmallVector<int, 8> M; 3145 N->getMask(M); 3146 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3147} 3148 3149/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3150/// specifies a shuffle of elements that is suitable for input to MOVSS, 3151/// MOVSD, and MOVD, i.e. setting the lowest element. 3152static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3153 if (VT.getVectorElementType().getSizeInBits() < 32) 3154 return false; 3155 3156 int NumElts = VT.getVectorNumElements(); 3157 3158 if (!isUndefOrEqual(Mask[0], NumElts)) 3159 return false; 3160 3161 for (int i = 1; i < NumElts; ++i) 3162 if (!isUndefOrEqual(Mask[i], i)) 3163 return false; 3164 3165 return true; 3166} 3167 3168bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3169 SmallVector<int, 8> M; 3170 N->getMask(M); 3171 return ::isMOVLMask(M, N->getValueType(0)); 3172} 3173 3174/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 3175/// of what x86 movss want. X86 movs requires the lowest element to be lowest 3176/// element of vector 2 and the other elements to come from vector 1 in order. 3177static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3178 bool V2IsSplat = false, bool V2IsUndef = false) { 3179 int NumOps = VT.getVectorNumElements(); 3180 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3181 return false; 3182 3183 if (!isUndefOrEqual(Mask[0], 0)) 3184 return false; 3185 3186 for (int i = 1; i < NumOps; ++i) 3187 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3188 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3189 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3190 return false; 3191 3192 return true; 3193} 3194 3195static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3196 bool V2IsUndef = false) { 3197 SmallVector<int, 8> M; 3198 N->getMask(M); 3199 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3200} 3201 3202/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3203/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3204bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3205 if (N->getValueType(0).getVectorNumElements() != 4) 3206 return false; 3207 3208 // Expect 1, 1, 3, 3 3209 for (unsigned i = 0; i < 2; ++i) { 3210 int Elt = N->getMaskElt(i); 3211 if (Elt >= 0 && Elt != 1) 3212 return false; 3213 } 3214 3215 bool HasHi = false; 3216 for (unsigned i = 2; i < 4; ++i) { 3217 int Elt = N->getMaskElt(i); 3218 if (Elt >= 0 && Elt != 3) 3219 return false; 3220 if (Elt == 3) 3221 HasHi = true; 3222 } 3223 // Don't use movshdup if it can be done with a shufps. 3224 // FIXME: verify that matching u, u, 3, 3 is what we want. 3225 return HasHi; 3226} 3227 3228/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3229/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 
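/// The expected v4f32 mask is <0,0,2,2> (undef entries allowed), i.e. each
/// even-indexed source element is duplicated into the following lane.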
3230bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3231 if (N->getValueType(0).getVectorNumElements() != 4) 3232 return false; 3233 3234 // Expect 0, 0, 2, 2 3235 for (unsigned i = 0; i < 2; ++i) 3236 if (N->getMaskElt(i) > 0) 3237 return false; 3238 3239 bool HasHi = false; 3240 for (unsigned i = 2; i < 4; ++i) { 3241 int Elt = N->getMaskElt(i); 3242 if (Elt >= 0 && Elt != 2) 3243 return false; 3244 if (Elt == 2) 3245 HasHi = true; 3246 } 3247 // Don't use movsldup if it can be done with a shufps. 3248 return HasHi; 3249} 3250 3251/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3252/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3253bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3254 int e = N->getValueType(0).getVectorNumElements() / 2; 3255 3256 for (int i = 0; i < e; ++i) 3257 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3258 return false; 3259 for (int i = 0; i < e; ++i) 3260 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3261 return false; 3262 return true; 3263} 3264 3265/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3266/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3267unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3268 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3269 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3270 3271 unsigned Shift = (NumOperands == 4) ? 2 : 1; 3272 unsigned Mask = 0; 3273 for (int i = 0; i < NumOperands; ++i) { 3274 int Val = SVOp->getMaskElt(NumOperands-i-1); 3275 if (Val < 0) Val = 0; 3276 if (Val >= NumOperands) Val -= NumOperands; 3277 Mask |= Val; 3278 if (i != NumOperands - 1) 3279 Mask <<= Shift; 3280 } 3281 return Mask; 3282} 3283 3284/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3285/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3286unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3287 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3288 unsigned Mask = 0; 3289 // 8 nodes, but we only care about the last 4. 3290 for (unsigned i = 7; i >= 4; --i) { 3291 int Val = SVOp->getMaskElt(i); 3292 if (Val >= 0) 3293 Mask |= (Val - 4); 3294 if (i != 4) 3295 Mask <<= 2; 3296 } 3297 return Mask; 3298} 3299 3300/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3301/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3302unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3303 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3304 unsigned Mask = 0; 3305 // 8 nodes, but we only care about the first 4. 3306 for (int i = 3; i >= 0; --i) { 3307 int Val = SVOp->getMaskElt(i); 3308 if (Val >= 0) 3309 Mask |= Val; 3310 if (i != 0) 3311 Mask <<= 2; 3312 } 3313 return Mask; 3314} 3315 3316/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3317/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3318unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3319 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3320 EVT VVT = N->getValueType(0); 3321 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3322 int Val = 0; 3323 3324 unsigned i, e; 3325 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3326 Val = SVOp->getMaskElt(i); 3327 if (Val >= 0) 3328 break; 3329 } 3330 return (Val - i) * EltSize; 3331} 3332 3333/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3334/// constant +0.0. 
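/// Note that only +0.0 qualifies for the floating point case; a -0.0
/// ConstantFP is not considered a zero node here.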
3335bool X86::isZeroNode(SDValue Elt) { 3336 return ((isa<ConstantSDNode>(Elt) && 3337 cast<ConstantSDNode>(Elt)->isNullValue()) || 3338 (isa<ConstantFPSDNode>(Elt) && 3339 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3340} 3341 3342/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3343/// their permute mask. 3344static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3345 SelectionDAG &DAG) { 3346 EVT VT = SVOp->getValueType(0); 3347 unsigned NumElems = VT.getVectorNumElements(); 3348 SmallVector<int, 8> MaskVec; 3349 3350 for (unsigned i = 0; i != NumElems; ++i) { 3351 int idx = SVOp->getMaskElt(i); 3352 if (idx < 0) 3353 MaskVec.push_back(idx); 3354 else if (idx < (int)NumElems) 3355 MaskVec.push_back(idx + NumElems); 3356 else 3357 MaskVec.push_back(idx - NumElems); 3358 } 3359 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3360 SVOp->getOperand(0), &MaskVec[0]); 3361} 3362 3363/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3364/// the two vector operands have swapped position. 3365static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3366 unsigned NumElems = VT.getVectorNumElements(); 3367 for (unsigned i = 0; i != NumElems; ++i) { 3368 int idx = Mask[i]; 3369 if (idx < 0) 3370 continue; 3371 else if (idx < (int)NumElems) 3372 Mask[i] = idx + NumElems; 3373 else 3374 Mask[i] = idx - NumElems; 3375 } 3376} 3377 3378/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3379/// match movhlps. The lower half elements should come from upper half of 3380/// V1 (and in order), and the upper half elements should come from the upper 3381/// half of V2 (and in order). 3382static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3383 if (Op->getValueType(0).getVectorNumElements() != 4) 3384 return false; 3385 for (unsigned i = 0, e = 2; i != e; ++i) 3386 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3387 return false; 3388 for (unsigned i = 2; i != 4; ++i) 3389 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3390 return false; 3391 return true; 3392} 3393 3394/// isScalarLoadToVector - Returns true if the node is a scalar load that 3395/// is promoted to a vector. It also returns the LoadSDNode by reference if 3396/// required. 3397static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3398 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3399 return false; 3400 N = N->getOperand(0).getNode(); 3401 if (!ISD::isNON_EXTLoad(N)) 3402 return false; 3403 if (LD) 3404 *LD = cast<LoadSDNode>(N); 3405 return true; 3406} 3407 3408/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3409/// match movlp{s|d}. The lower half elements should come from lower half of 3410/// V1 (and in order), and the upper half elements should come from the upper 3411/// half of V2 (and in order). And since V1 will become the source of the 3412/// MOVLP, it must be either a vector load or a scalar load to vector. 3413static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3414 ShuffleVectorSDNode *Op) { 3415 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3416 return false; 3417 // Is V2 is a vector load, don't do this transformation. We will try to use 3418 // load folding shufps op. 
3419 if (ISD::isNON_EXTLoad(V2)) 3420 return false; 3421 3422 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3423 3424 if (NumElems != 2 && NumElems != 4) 3425 return false; 3426 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3427 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3428 return false; 3429 for (unsigned i = NumElems/2; i != NumElems; ++i) 3430 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3431 return false; 3432 return true; 3433} 3434 3435/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3436/// all the same. 3437static bool isSplatVector(SDNode *N) { 3438 if (N->getOpcode() != ISD::BUILD_VECTOR) 3439 return false; 3440 3441 SDValue SplatValue = N->getOperand(0); 3442 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3443 if (N->getOperand(i) != SplatValue) 3444 return false; 3445 return true; 3446} 3447 3448/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3449/// to an zero vector. 3450/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3451static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3452 SDValue V1 = N->getOperand(0); 3453 SDValue V2 = N->getOperand(1); 3454 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3455 for (unsigned i = 0; i != NumElems; ++i) { 3456 int Idx = N->getMaskElt(i); 3457 if (Idx >= (int)NumElems) { 3458 unsigned Opc = V2.getOpcode(); 3459 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3460 continue; 3461 if (Opc != ISD::BUILD_VECTOR || 3462 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3463 return false; 3464 } else if (Idx >= 0) { 3465 unsigned Opc = V1.getOpcode(); 3466 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3467 continue; 3468 if (Opc != ISD::BUILD_VECTOR || 3469 !X86::isZeroNode(V1.getOperand(Idx))) 3470 return false; 3471 } 3472 } 3473 return true; 3474} 3475 3476/// getZeroVector - Returns a vector of specified type with all zero elements. 3477/// 3478static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3479 DebugLoc dl) { 3480 assert(VT.isVector() && "Expected a vector type"); 3481 3482 // Always build SSE zero vectors as <4 x i32> bitcasted 3483 // to their dest type. This ensures they get CSE'd. 3484 SDValue Vec; 3485 if (VT.getSizeInBits() == 128) { // SSE 3486 if (HasSSE2) { // SSE2 3487 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3488 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3489 } else { // SSE1 3490 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3491 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3492 } 3493 } else if (VT.getSizeInBits() == 256) { // AVX 3494 // 256-bit logic and arithmetic instructions in AVX are 3495 // all floating-point, no support for integer ops. Default 3496 // to emitting fp zeroed vectors then. 3497 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3498 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3499 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3500 } 3501 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3502} 3503 3504/// getOnesVector - Returns a vector of specified type with all bits set. 3505/// 3506static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3507 assert(VT.isVector() && "Expected a vector type"); 3508 3509 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3510 // type. This ensures they get CSE'd. 
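  // For instance, a v2i64 all-ones vector is emitted as a v4i32 BUILD_VECTOR
  // of 0xFFFFFFFF constants and then bitcast to v2i64; the bit pattern is the
  // same and the common v4i32 node can be CSE'd.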
3511 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3512 SDValue Vec; 3513 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3514 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3515} 3516 3517 3518/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3519/// that point to V2 points to its first element. 3520static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3521 EVT VT = SVOp->getValueType(0); 3522 unsigned NumElems = VT.getVectorNumElements(); 3523 3524 bool Changed = false; 3525 SmallVector<int, 8> MaskVec; 3526 SVOp->getMask(MaskVec); 3527 3528 for (unsigned i = 0; i != NumElems; ++i) { 3529 if (MaskVec[i] > (int)NumElems) { 3530 MaskVec[i] = NumElems; 3531 Changed = true; 3532 } 3533 } 3534 if (Changed) 3535 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3536 SVOp->getOperand(1), &MaskVec[0]); 3537 return SDValue(SVOp, 0); 3538} 3539 3540/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3541/// operation of specified width. 3542static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3543 SDValue V2) { 3544 unsigned NumElems = VT.getVectorNumElements(); 3545 SmallVector<int, 8> Mask; 3546 Mask.push_back(NumElems); 3547 for (unsigned i = 1; i != NumElems; ++i) 3548 Mask.push_back(i); 3549 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3550} 3551 3552/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3553static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3554 SDValue V2) { 3555 unsigned NumElems = VT.getVectorNumElements(); 3556 SmallVector<int, 8> Mask; 3557 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3558 Mask.push_back(i); 3559 Mask.push_back(i + NumElems); 3560 } 3561 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3562} 3563 3564/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3565static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3566 SDValue V2) { 3567 unsigned NumElems = VT.getVectorNumElements(); 3568 unsigned Half = NumElems/2; 3569 SmallVector<int, 8> Mask; 3570 for (unsigned i = 0; i != Half; ++i) { 3571 Mask.push_back(i + Half); 3572 Mask.push_back(i + NumElems + Half); 3573 } 3574 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3575} 3576 3577/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3578static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 3579 EVT PVT = MVT::v4f32; 3580 EVT VT = SV->getValueType(0); 3581 DebugLoc dl = SV->getDebugLoc(); 3582 SDValue V1 = SV->getOperand(0); 3583 int NumElems = VT.getVectorNumElements(); 3584 int EltNo = SV->getSplatIndex(); 3585 3586 // unpack elements to the correct location 3587 while (NumElems > 4) { 3588 if (EltNo < NumElems/2) { 3589 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3590 } else { 3591 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3592 EltNo -= NumElems/2; 3593 } 3594 NumElems >>= 1; 3595 } 3596 3597 // Perform the splat. 3598 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3599 V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1); 3600 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3601 return DAG.getNode(ISD::BITCAST, dl, VT, V1); 3602} 3603 3604/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3605/// vector of zero or undef vector. This produces a shuffle where the low 3606/// element of V2 is swizzled into the zero/undef vector, landing at element 3607/// Idx. 
This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3608static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3609 bool isZero, bool HasSSE2, 3610 SelectionDAG &DAG) { 3611 EVT VT = V2.getValueType(); 3612 SDValue V1 = isZero 3613 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3614 unsigned NumElems = VT.getVectorNumElements(); 3615 SmallVector<int, 16> MaskVec; 3616 for (unsigned i = 0; i != NumElems; ++i) 3617 // If this is the insertion idx, put the low elt of V2 here. 3618 MaskVec.push_back(i == Idx ? NumElems : i); 3619 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3620} 3621 3622/// getShuffleScalarElt - Returns the scalar element that will make up the ith 3623/// element of the result of the vector shuffle. 3624SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 3625 unsigned Depth) { 3626 if (Depth == 6) 3627 return SDValue(); // Limit search depth. 3628 3629 SDValue V = SDValue(N, 0); 3630 EVT VT = V.getValueType(); 3631 unsigned Opcode = V.getOpcode(); 3632 3633 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 3634 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 3635 Index = SV->getMaskElt(Index); 3636 3637 if (Index < 0) 3638 return DAG.getUNDEF(VT.getVectorElementType()); 3639 3640 int NumElems = VT.getVectorNumElements(); 3641 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 3642 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 3643 } 3644 3645 // Recurse into target specific vector shuffles to find scalars. 3646 if (isTargetShuffle(Opcode)) { 3647 int NumElems = VT.getVectorNumElements(); 3648 SmallVector<unsigned, 16> ShuffleMask; 3649 SDValue ImmN; 3650 3651 switch(Opcode) { 3652 case X86ISD::SHUFPS: 3653 case X86ISD::SHUFPD: 3654 ImmN = N->getOperand(N->getNumOperands()-1); 3655 DecodeSHUFPSMask(NumElems, 3656 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3657 ShuffleMask); 3658 break; 3659 case X86ISD::PUNPCKHBW: 3660 case X86ISD::PUNPCKHWD: 3661 case X86ISD::PUNPCKHDQ: 3662 case X86ISD::PUNPCKHQDQ: 3663 DecodePUNPCKHMask(NumElems, ShuffleMask); 3664 break; 3665 case X86ISD::UNPCKHPS: 3666 case X86ISD::UNPCKHPD: 3667 DecodeUNPCKHPMask(NumElems, ShuffleMask); 3668 break; 3669 case X86ISD::PUNPCKLBW: 3670 case X86ISD::PUNPCKLWD: 3671 case X86ISD::PUNPCKLDQ: 3672 case X86ISD::PUNPCKLQDQ: 3673 DecodePUNPCKLMask(NumElems, ShuffleMask); 3674 break; 3675 case X86ISD::UNPCKLPS: 3676 case X86ISD::UNPCKLPD: 3677 DecodeUNPCKLPMask(NumElems, ShuffleMask); 3678 break; 3679 case X86ISD::MOVHLPS: 3680 DecodeMOVHLPSMask(NumElems, ShuffleMask); 3681 break; 3682 case X86ISD::MOVLHPS: 3683 DecodeMOVLHPSMask(NumElems, ShuffleMask); 3684 break; 3685 case X86ISD::PSHUFD: 3686 ImmN = N->getOperand(N->getNumOperands()-1); 3687 DecodePSHUFMask(NumElems, 3688 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3689 ShuffleMask); 3690 break; 3691 case X86ISD::PSHUFHW: 3692 ImmN = N->getOperand(N->getNumOperands()-1); 3693 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3694 ShuffleMask); 3695 break; 3696 case X86ISD::PSHUFLW: 3697 ImmN = N->getOperand(N->getNumOperands()-1); 3698 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3699 ShuffleMask); 3700 break; 3701 case X86ISD::MOVSS: 3702 case X86ISD::MOVSD: { 3703 // The index 0 always comes from the first element of the second source, 3704 // this is why MOVSS and MOVSD are used in the first place. 
The other 3705  // elements come from the other positions of the first source vector.
3706      unsigned OpNum = (Index == 0) ? 1 : 0;
3707      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
3708                                 Depth+1);
3709    }
3710    default:
3711      assert(0 && "not implemented for target shuffle node");
3712      return SDValue();
3713    }
3714
3715    Index = ShuffleMask[Index];
3716    if (Index < 0)
3717      return DAG.getUNDEF(VT.getVectorElementType());
3718
3719    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
3720    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
3721                               Depth+1);
3722  }
3723
3724  // Actual nodes that may contain scalar elements
3725  if (Opcode == ISD::BITCAST) {
3726    V = V.getOperand(0);
3727    EVT SrcVT = V.getValueType();
3728    unsigned NumElems = VT.getVectorNumElements();
3729
3730    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
3731      return SDValue();
3732  }
3733
3734  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
3735    return (Index == 0) ? V.getOperand(0)
3736                        : DAG.getUNDEF(VT.getVectorElementType());
3737
3738  if (V.getOpcode() == ISD::BUILD_VECTOR)
3739    return V.getOperand(Index);
3740
3741  return SDValue();
3742}
3743
3744/// getNumOfConsecutiveZeros - Return the number of elements of a vector
3745/// shuffle operation which are consecutively zero (or undef). The search can
3746/// start from either end of the mask, i.e. from the left or from the right.
3747static
3748unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
3749                                  bool ZerosFromLeft, SelectionDAG &DAG) {
3750  int i = 0;
3751
3752  while (i < NumElems) {
3753    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
3754    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
3755    if (!(Elt.getNode() &&
3756         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
3757      break;
3758    ++i;
3759  }
3760
3761  return i;
3762}
3763
3764/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
3765/// MaskE correspond consecutively to elements from one of the vector operands,
3766/// starting from its index OpIdx. OpNum is set to the source operand used.
3767static
3768bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
3769                              int OpIdx, int NumElems, unsigned &OpNum) {
3770  bool SeenV1 = false;
3771  bool SeenV2 = false;
3772
3773  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
3774    int Idx = SVOp->getMaskElt(i);
3775    // Ignore undef indices
3776    if (Idx < 0)
3777      continue;
3778
3779    if (Idx < NumElems)
3780      SeenV1 = true;
3781    else
3782      SeenV2 = true;
3783
3784    // Only accept consecutive elements from the same vector
3785    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
3786      return false;
3787  }
3788
3789  OpNum = SeenV1 ? 0 : 1;
3790  return true;
3791}
3792
3793/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
3794/// logical right shift of a vector.
3795static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3796                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3797  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
3798  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
3799              false /* check zeros from right */, DAG);
3800  unsigned OpSrc;
3801
3802  if (!NumZeros)
3803    return false;
3804
3805  // Considering the elements in the mask that are not consecutive zeros,
3806  // check if they consecutively come from only one of the source vectors.
3807 // 3808 // V1 = {X, A, B, C} 0 3809 // \ \ \ / 3810 // vector_shuffle V1, V2 <1, 2, 3, X> 3811 // 3812 if (!isShuffleMaskConsecutive(SVOp, 3813 0, // Mask Start Index 3814 NumElems-NumZeros-1, // Mask End Index 3815 NumZeros, // Where to start looking in the src vector 3816 NumElems, // Number of elements in vector 3817 OpSrc)) // Which source operand ? 3818 return false; 3819 3820 isLeft = false; 3821 ShAmt = NumZeros; 3822 ShVal = SVOp->getOperand(OpSrc); 3823 return true; 3824} 3825 3826/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 3827/// logical left shift of a vector. 3828static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3829 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3830 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3831 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 3832 true /* check zeros from left */, DAG); 3833 unsigned OpSrc; 3834 3835 if (!NumZeros) 3836 return false; 3837 3838 // Considering the elements in the mask that are not consecutive zeros, 3839 // check if they consecutively come from only one of the source vectors. 3840 // 3841 // 0 { A, B, X, X } = V2 3842 // / \ / / 3843 // vector_shuffle V1, V2 <X, X, 4, 5> 3844 // 3845 if (!isShuffleMaskConsecutive(SVOp, 3846 NumZeros, // Mask Start Index 3847 NumElems-1, // Mask End Index 3848 0, // Where to start looking in the src vector 3849 NumElems, // Number of elements in vector 3850 OpSrc)) // Which source operand ? 3851 return false; 3852 3853 isLeft = true; 3854 ShAmt = NumZeros; 3855 ShVal = SVOp->getOperand(OpSrc); 3856 return true; 3857} 3858 3859/// isVectorShift - Returns true if the shuffle can be implemented as a 3860/// logical left or right shift of a vector. 3861static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3862 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3863 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 3864 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 3865 return true; 3866 3867 return false; 3868} 3869 3870/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
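/// Sketch of the approach: non-zero bytes are zero-extended to i16, adjacent
/// pairs are merged as (odd byte << 8) | even byte, each merged value is
/// inserted into a v8i16 at position i/2, and the result is bitcast back to
/// v16i8.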
3871/// 3872static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3873 unsigned NumNonZero, unsigned NumZero, 3874 SelectionDAG &DAG, 3875 const TargetLowering &TLI) { 3876 if (NumNonZero > 8) 3877 return SDValue(); 3878 3879 DebugLoc dl = Op.getDebugLoc(); 3880 SDValue V(0, 0); 3881 bool First = true; 3882 for (unsigned i = 0; i < 16; ++i) { 3883 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3884 if (ThisIsNonZero && First) { 3885 if (NumZero) 3886 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3887 else 3888 V = DAG.getUNDEF(MVT::v8i16); 3889 First = false; 3890 } 3891 3892 if ((i & 1) != 0) { 3893 SDValue ThisElt(0, 0), LastElt(0, 0); 3894 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3895 if (LastIsNonZero) { 3896 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3897 MVT::i16, Op.getOperand(i-1)); 3898 } 3899 if (ThisIsNonZero) { 3900 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3901 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3902 ThisElt, DAG.getConstant(8, MVT::i8)); 3903 if (LastIsNonZero) 3904 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3905 } else 3906 ThisElt = LastElt; 3907 3908 if (ThisElt.getNode()) 3909 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3910 DAG.getIntPtrConstant(i/2)); 3911 } 3912 } 3913 3914 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 3915} 3916 3917/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3918/// 3919static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3920 unsigned NumNonZero, unsigned NumZero, 3921 SelectionDAG &DAG, 3922 const TargetLowering &TLI) { 3923 if (NumNonZero > 4) 3924 return SDValue(); 3925 3926 DebugLoc dl = Op.getDebugLoc(); 3927 SDValue V(0, 0); 3928 bool First = true; 3929 for (unsigned i = 0; i < 8; ++i) { 3930 bool isNonZero = (NonZeros & (1 << i)) != 0; 3931 if (isNonZero) { 3932 if (First) { 3933 if (NumZero) 3934 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3935 else 3936 V = DAG.getUNDEF(MVT::v8i16); 3937 First = false; 3938 } 3939 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3940 MVT::v8i16, V, Op.getOperand(i), 3941 DAG.getIntPtrConstant(i)); 3942 } 3943 } 3944 3945 return V; 3946} 3947 3948/// getVShift - Return a vector logical shift node. 3949/// 3950static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3951 unsigned NumBits, SelectionDAG &DAG, 3952 const TargetLowering &TLI, DebugLoc dl) { 3953 EVT ShVT = MVT::v2i64; 3954 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3955 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 3956 return DAG.getNode(ISD::BITCAST, dl, VT, 3957 DAG.getNode(Opc, dl, ShVT, SrcOp, 3958 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3959} 3960 3961SDValue 3962X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3963 SelectionDAG &DAG) const { 3964 3965 // Check if the scalar load can be widened into a vector load. And if 3966 // the address is "base + cst" see if the cst can be "absorbed" into 3967 // the shuffle mask. 
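  // For example (illustrative): a 4-byte f32 load from a frame index + 8 can
  // be rewritten as a 16-byte aligned v4f32 load of the frame index itself,
  // followed by the splat shuffle <2,2,2,2>, since (8 - (8 & ~15)) >> 2 == 2.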
3968 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3969 SDValue Ptr = LD->getBasePtr(); 3970 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3971 return SDValue(); 3972 EVT PVT = LD->getValueType(0); 3973 if (PVT != MVT::i32 && PVT != MVT::f32) 3974 return SDValue(); 3975 3976 int FI = -1; 3977 int64_t Offset = 0; 3978 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3979 FI = FINode->getIndex(); 3980 Offset = 0; 3981 } else if (Ptr.getOpcode() == ISD::ADD && 3982 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3983 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3984 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3985 Offset = Ptr.getConstantOperandVal(1); 3986 Ptr = Ptr.getOperand(0); 3987 } else { 3988 return SDValue(); 3989 } 3990 3991 SDValue Chain = LD->getChain(); 3992 // Make sure the stack object alignment is at least 16. 3993 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3994 if (DAG.InferPtrAlignment(Ptr) < 16) { 3995 if (MFI->isFixedObjectIndex(FI)) { 3996 // Can't change the alignment. FIXME: It's possible to compute 3997 // the exact stack offset and reference FI + adjust offset instead. 3998 // If someone *really* cares about this. That's the way to implement it. 3999 return SDValue(); 4000 } else { 4001 MFI->setObjectAlignment(FI, 16); 4002 } 4003 } 4004 4005 // (Offset % 16) must be multiple of 4. Then address is then 4006 // Ptr + (Offset & ~15). 4007 if (Offset < 0) 4008 return SDValue(); 4009 if ((Offset % 16) & 3) 4010 return SDValue(); 4011 int64_t StartOffset = Offset & ~15; 4012 if (StartOffset) 4013 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4014 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4015 4016 int EltNo = (Offset - StartOffset) >> 2; 4017 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 4018 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; 4019 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, 4020 LD->getPointerInfo().getWithOffset(StartOffset), 4021 false, false, 0); 4022 // Canonicalize it to a v4i32 shuffle. 4023 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 4024 return DAG.getNode(ISD::BITCAST, dl, VT, 4025 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 4026 DAG.getUNDEF(MVT::v4i32),&Mask[0])); 4027 } 4028 4029 return SDValue(); 4030} 4031 4032/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4033/// vector of type 'VT', see if the elements can be replaced by a single large 4034/// load which has the same value as a build_vector whose operands are 'elts'. 4035/// 4036/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4037/// 4038/// FIXME: we'd also like to handle the case where the last elements are zero 4039/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4040/// There's even a handy isZeroNode for that purpose. 4041static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4042 DebugLoc &DL, SelectionDAG &DAG) { 4043 EVT EltVT = VT.getVectorElementType(); 4044 unsigned NumElems = Elts.size(); 4045 4046 LoadSDNode *LDBase = NULL; 4047 unsigned LastLoadedElt = -1U; 4048 4049 // For each element in the initializer, see if we've found a load or an undef. 4050 // If we don't find an initial load element, or later load elements are 4051 // non-consecutive, bail out. 
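  // For example, the v4i32 initializer <load a, load a+4, load a+8, load a+12>
  // folds to one 16-byte load of 'a', and <load a, load a+4, undef, undef>
  // folds to an X86ISD::VZEXT_LOAD of the low 8 bytes at 'a'.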
4052 for (unsigned i = 0; i < NumElems; ++i) { 4053 SDValue Elt = Elts[i]; 4054 4055 if (!Elt.getNode() || 4056 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4057 return SDValue(); 4058 if (!LDBase) { 4059 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4060 return SDValue(); 4061 LDBase = cast<LoadSDNode>(Elt.getNode()); 4062 LastLoadedElt = i; 4063 continue; 4064 } 4065 if (Elt.getOpcode() == ISD::UNDEF) 4066 continue; 4067 4068 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4069 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4070 return SDValue(); 4071 LastLoadedElt = i; 4072 } 4073 4074 // If we have found an entire vector of loads and undefs, then return a large 4075 // load of the entire vector width starting at the base pointer. If we found 4076 // consecutive loads for the low half, generate a vzext_load node. 4077 if (LastLoadedElt == NumElems - 1) { 4078 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4079 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4080 LDBase->getPointerInfo(), 4081 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 4082 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4083 LDBase->getPointerInfo(), 4084 LDBase->isVolatile(), LDBase->isNonTemporal(), 4085 LDBase->getAlignment()); 4086 } else if (NumElems == 4 && LastLoadedElt == 1) { 4087 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4088 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4089 SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, 4090 Ops, 2, MVT::i32, 4091 LDBase->getMemOperand()); 4092 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4093 } 4094 return SDValue(); 4095} 4096 4097SDValue 4098X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 4099 DebugLoc dl = Op.getDebugLoc(); 4100 // All zero's are handled with pxor in SSE2 and above, xorps in SSE1. 4101 // All one's are handled with pcmpeqd. In AVX, zero's are handled with 4102 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd 4103 // is present, so AllOnes is ignored. 4104 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 4105 (Op.getValueType().getSizeInBits() != 256 && 4106 ISD::isBuildVectorAllOnes(Op.getNode()))) { 4107 // Canonicalize this to <4 x i32> (SSE) to 4108 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 4109 // eliminated on x86-32 hosts. 4110 if (Op.getValueType() == MVT::v4i32) 4111 return Op; 4112 4113 if (ISD::isBuildVectorAllOnes(Op.getNode())) 4114 return getOnesVector(Op.getValueType(), DAG, dl); 4115 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 4116 } 4117 4118 EVT VT = Op.getValueType(); 4119 EVT ExtVT = VT.getVectorElementType(); 4120 unsigned EVTBits = ExtVT.getSizeInBits(); 4121 4122 unsigned NumElems = Op.getNumOperands(); 4123 unsigned NumZero = 0; 4124 unsigned NumNonZero = 0; 4125 unsigned NonZeros = 0; 4126 bool IsAllConstants = true; 4127 SmallSet<SDValue, 8> Values; 4128 for (unsigned i = 0; i < NumElems; ++i) { 4129 SDValue Elt = Op.getOperand(i); 4130 if (Elt.getOpcode() == ISD::UNDEF) 4131 continue; 4132 Values.insert(Elt); 4133 if (Elt.getOpcode() != ISD::Constant && 4134 Elt.getOpcode() != ISD::ConstantFP) 4135 IsAllConstants = false; 4136 if (X86::isZeroNode(Elt)) 4137 NumZero++; 4138 else { 4139 NonZeros |= (1 << i); 4140 NumNonZero++; 4141 } 4142 } 4143 4144 // All undef vector. Return an UNDEF. All zero vectors were handled above. 
4145 if (NumNonZero == 0) 4146 return DAG.getUNDEF(VT); 4147 4148 // Special case for single non-zero, non-undef, element. 4149 if (NumNonZero == 1) { 4150 unsigned Idx = CountTrailingZeros_32(NonZeros); 4151 SDValue Item = Op.getOperand(Idx); 4152 4153 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4154 // the value are obviously zero, truncate the value to i32 and do the 4155 // insertion that way. Only do this if the value is non-constant or if the 4156 // value is a constant being inserted into element 0. It is cheaper to do 4157 // a constant pool load than it is to do a movd + shuffle. 4158 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 4159 (!IsAllConstants || Idx == 0)) { 4160 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 4161 // Handle SSE only. 4162 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 4163 EVT VecVT = MVT::v4i32; 4164 unsigned VecElts = 4; 4165 4166 // Truncate the value (which may itself be a constant) to i32, and 4167 // convert it to a vector with movd (S2V+shuffle to zero extend). 4168 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4169 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4170 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4171 Subtarget->hasSSE2(), DAG); 4172 4173 // Now we have our 32-bit value zero extended in the low element of 4174 // a vector. If Idx != 0, swizzle it into place. 4175 if (Idx != 0) { 4176 SmallVector<int, 4> Mask; 4177 Mask.push_back(Idx); 4178 for (unsigned i = 1; i != VecElts; ++i) 4179 Mask.push_back(i); 4180 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4181 DAG.getUNDEF(Item.getValueType()), 4182 &Mask[0]); 4183 } 4184 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); 4185 } 4186 } 4187 4188 // If we have a constant or non-constant insertion into the low element of 4189 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4190 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4191 // depending on what the source datatype is. 4192 if (Idx == 0) { 4193 if (NumZero == 0) { 4194 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4195 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4196 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4197 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4198 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 4199 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4200 DAG); 4201 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4202 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4203 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 4204 EVT MiddleVT = MVT::v4i32; 4205 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4206 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4207 Subtarget->hasSSE2(), DAG); 4208 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 4209 } 4210 } 4211 4212 // Is it a vector logical left shift? 4213 if (NumElems == 2 && Idx == 1 && 4214 X86::isZeroNode(Op.getOperand(0)) && 4215 !X86::isZeroNode(Op.getOperand(1))) { 4216 unsigned NumBits = VT.getSizeInBits(); 4217 return getVShift(true, VT, 4218 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4219 VT, Op.getOperand(1)), 4220 NumBits/2, DAG, *this, dl); 4221 } 4222 4223 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
4224 return SDValue(); 4225 4226 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4227 // is a non-constant being inserted into an element other than the low one, 4228 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4229 // movd/movss) to move this into the low element, then shuffle it into 4230 // place. 4231 if (EVTBits == 32) { 4232 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4233 4234 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4235 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4236 Subtarget->hasSSE2(), DAG); 4237 SmallVector<int, 8> MaskVec; 4238 for (unsigned i = 0; i < NumElems; i++) 4239 MaskVec.push_back(i == Idx ? 0 : 1); 4240 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4241 } 4242 } 4243 4244 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4245 if (Values.size() == 1) { 4246 if (EVTBits == 32) { 4247 // Instead of a shuffle like this: 4248 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4249 // Check if it's possible to issue this instead. 4250 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4251 unsigned Idx = CountTrailingZeros_32(NonZeros); 4252 SDValue Item = Op.getOperand(Idx); 4253 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4254 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4255 } 4256 return SDValue(); 4257 } 4258 4259 // A vector full of immediates; various special cases are already 4260 // handled, so this is best done with a single constant-pool load. 4261 if (IsAllConstants) 4262 return SDValue(); 4263 4264 // Let legalizer expand 2-wide build_vectors. 4265 if (EVTBits == 64) { 4266 if (NumNonZero == 1) { 4267 // One half is zero or undef. 4268 unsigned Idx = CountTrailingZeros_32(NonZeros); 4269 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4270 Op.getOperand(Idx)); 4271 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4272 Subtarget->hasSSE2(), DAG); 4273 } 4274 return SDValue(); 4275 } 4276 4277 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4278 if (EVTBits == 8 && NumElems == 16) { 4279 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4280 *this); 4281 if (V.getNode()) return V; 4282 } 4283 4284 if (EVTBits == 16 && NumElems == 8) { 4285 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4286 *this); 4287 if (V.getNode()) return V; 4288 } 4289 4290 // If element VT is == 32 bits, turn it into a number of shuffles. 4291 SmallVector<SDValue, 8> V; 4292 V.resize(NumElems); 4293 if (NumElems == 4 && NumZero > 0) { 4294 for (unsigned i = 0; i < 4; ++i) { 4295 bool isZero = !(NonZeros & (1 << i)); 4296 if (isZero) 4297 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4298 else 4299 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4300 } 4301 4302 for (unsigned i = 0; i < 2; ++i) { 4303 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4304 default: break; 4305 case 0: 4306 V[i] = V[i*2]; // Must be a zero vector. 4307 break; 4308 case 1: 4309 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4310 break; 4311 case 2: 4312 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4313 break; 4314 case 3: 4315 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4316 break; 4317 } 4318 } 4319 4320 SmallVector<int, 8> MaskVec; 4321 bool Reverse = (NonZeros & 0x3) == 2; 4322 for (unsigned i = 0; i < 2; ++i) 4323 MaskVec.push_back(Reverse ? 
1-i : i); 4324 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4325 for (unsigned i = 0; i < 2; ++i) 4326 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4327 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4328 } 4329 4330 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4331 // Check for a build vector of consecutive loads. 4332 for (unsigned i = 0; i < NumElems; ++i) 4333 V[i] = Op.getOperand(i); 4334 4335 // Check for elements which are consecutive loads. 4336 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4337 if (LD.getNode()) 4338 return LD; 4339 4340 // For SSE 4.1, use insertps to put the high elements into the low element. 4341 if (getSubtarget()->hasSSE41()) { 4342 SDValue Result; 4343 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 4344 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 4345 else 4346 Result = DAG.getUNDEF(VT); 4347 4348 for (unsigned i = 1; i < NumElems; ++i) { 4349 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 4350 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 4351 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4352 } 4353 return Result; 4354 } 4355 4356 // Otherwise, expand into a number of unpckl*, start by extending each of 4357 // our (non-undef) elements to the full vector width with the element in the 4358 // bottom slot of the vector (which generates no code for SSE). 4359 for (unsigned i = 0; i < NumElems; ++i) { 4360 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4361 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4362 else 4363 V[i] = DAG.getUNDEF(VT); 4364 } 4365 4366 // Next, we iteratively mix elements, e.g. for v4f32: 4367 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4368 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4369 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4370 unsigned EltStride = NumElems >> 1; 4371 while (EltStride != 0) { 4372 for (unsigned i = 0; i < EltStride; ++i) { 4373 // If V[i+EltStride] is undef and this is the first round of mixing, 4374 // then it is safe to just drop this shuffle: V[i] is already in the 4375 // right place, the one element (since it's the first round) being 4376 // inserted as undef can be dropped. This isn't safe for successive 4377 // rounds because they will permute elements within both vectors. 4378 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 4379 EltStride == NumElems/2) 4380 continue; 4381 4382 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 4383 } 4384 EltStride >>= 1; 4385 } 4386 return V[0]; 4387 } 4388 return SDValue(); 4389} 4390 4391SDValue 4392X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4393 // We support concatenate two MMX registers and place them in a MMX 4394 // register. This is better than doing a stack convert. 
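  // (Illustrative summary: the first operand is bitcast to v1i64 and moved
  // into an XMM register with X86ISD::MOVQ2DQ; the upper half of the result
  // is then filled either by inserting the second operand's underlying scalar
  // into the upper half, or by MOVQ2DQ'ing the second operand as well and
  // shuffling the two low quadwords together with the v2i64 mask <0,2>.)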
4395 DebugLoc dl = Op.getDebugLoc(); 4396 EVT ResVT = Op.getValueType(); 4397 assert(Op.getNumOperands() == 2); 4398 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4399 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4400 int Mask[2]; 4401 SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); 4402 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4403 InVec = Op.getOperand(1); 4404 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4405 unsigned NumElts = ResVT.getVectorNumElements(); 4406 VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4407 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4408 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4409 } else { 4410 InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); 4411 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4412 Mask[0] = 0; Mask[1] = 2; 4413 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4414 } 4415 return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4416} 4417 4418// v8i16 shuffles - Prefer shuffles in the following order: 4419// 1. [all] pshuflw, pshufhw, optional move 4420// 2. [ssse3] 1 x pshufb 4421// 3. [ssse3] 2 x pshufb + 1 x por 4422// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4423SDValue 4424X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4425 SelectionDAG &DAG) const { 4426 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4427 SDValue V1 = SVOp->getOperand(0); 4428 SDValue V2 = SVOp->getOperand(1); 4429 DebugLoc dl = SVOp->getDebugLoc(); 4430 SmallVector<int, 8> MaskVals; 4431 4432 // Determine if more than 1 of the words in each of the low and high quadwords 4433 // of the result come from the same quadword of one of the two inputs. Undef 4434 // mask values count as coming from any quadword, for better codegen. 4435 SmallVector<unsigned, 4> LoQuad(4); 4436 SmallVector<unsigned, 4> HiQuad(4); 4437 BitVector InputQuads(4); 4438 for (unsigned i = 0; i < 8; ++i) { 4439 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4440 int EltIdx = SVOp->getMaskElt(i); 4441 MaskVals.push_back(EltIdx); 4442 if (EltIdx < 0) { 4443 ++Quad[0]; 4444 ++Quad[1]; 4445 ++Quad[2]; 4446 ++Quad[3]; 4447 continue; 4448 } 4449 ++Quad[EltIdx / 4]; 4450 InputQuads.set(EltIdx / 4); 4451 } 4452 4453 int BestLoQuad = -1; 4454 unsigned MaxQuad = 1; 4455 for (unsigned i = 0; i < 4; ++i) { 4456 if (LoQuad[i] > MaxQuad) { 4457 BestLoQuad = i; 4458 MaxQuad = LoQuad[i]; 4459 } 4460 } 4461 4462 int BestHiQuad = -1; 4463 MaxQuad = 1; 4464 for (unsigned i = 0; i < 4; ++i) { 4465 if (HiQuad[i] > MaxQuad) { 4466 BestHiQuad = i; 4467 MaxQuad = HiQuad[i]; 4468 } 4469 } 4470 4471 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 4472 // of the two input vectors, shuffle them into one input vector so only a 4473 // single pshufb instruction is necessary. If There are more than 2 input 4474 // quads, disable the next transformation since it does not help SSSE3. 4475 bool V1Used = InputQuads[0] || InputQuads[1]; 4476 bool V2Used = InputQuads[2] || InputQuads[3]; 4477 if (Subtarget->hasSSSE3()) { 4478 if (InputQuads.count() == 2 && V1Used && V2Used) { 4479 BestLoQuad = InputQuads.find_first(); 4480 BestHiQuad = InputQuads.find_next(BestLoQuad); 4481 } 4482 if (InputQuads.count() > 2) { 4483 BestLoQuad = -1; 4484 BestHiQuad = -1; 4485 } 4486 } 4487 4488 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4489 // the shuffle mask. 
If a quad is scored as -1, that means that it contains 4490 // words from all 4 input quadwords. 4491 SDValue NewV; 4492 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4493 SmallVector<int, 8> MaskV; 4494 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4495 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4496 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4497 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 4498 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 4499 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 4500 4501 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4502 // source words for the shuffle, to aid later transformations. 4503 bool AllWordsInNewV = true; 4504 bool InOrder[2] = { true, true }; 4505 for (unsigned i = 0; i != 8; ++i) { 4506 int idx = MaskVals[i]; 4507 if (idx != (int)i) 4508 InOrder[i/4] = false; 4509 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4510 continue; 4511 AllWordsInNewV = false; 4512 break; 4513 } 4514 4515 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4516 if (AllWordsInNewV) { 4517 for (int i = 0; i != 8; ++i) { 4518 int idx = MaskVals[i]; 4519 if (idx < 0) 4520 continue; 4521 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4522 if ((idx != i) && idx < 4) 4523 pshufhw = false; 4524 if ((idx != i) && idx > 3) 4525 pshuflw = false; 4526 } 4527 V1 = NewV; 4528 V2Used = false; 4529 BestLoQuad = 0; 4530 BestHiQuad = 1; 4531 } 4532 4533 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4534 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4535 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4536 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 4537 unsigned TargetMask = 0; 4538 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4539 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4540 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 4541 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 4542 V1 = NewV.getOperand(0); 4543 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 4544 } 4545 } 4546 4547 // If we have SSSE3, and all words of the result are from 1 input vector, 4548 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4549 // is present, fall back to case 4. 4550 if (Subtarget->hasSSSE3()) { 4551 SmallVector<SDValue,16> pshufbMask; 4552 4553 // If we have elements from both input vectors, set the high bit of the 4554 // shuffle mask element to zero out elements that come from V2 in the V1 4555 // mask, and elements that come from V1 in the V2 mask, so that the two 4556 // results can be OR'd together. 4557 bool TwoInputs = V1Used && V2Used; 4558 for (unsigned i = 0; i != 8; ++i) { 4559 int EltIdx = MaskVals[i] * 2; 4560 if (TwoInputs && (EltIdx >= 16)) { 4561 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4562 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4563 continue; 4564 } 4565 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4566 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4567 } 4568 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 4569 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4570 DAG.getNode(ISD::BUILD_VECTOR, dl, 4571 MVT::v16i8, &pshufbMask[0], 16)); 4572 if (!TwoInputs) 4573 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4574 4575 // Calculate the shuffle mask for the second input, shuffle it, and 4576 // OR it with the first shuffled input. 
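  // For example, with MaskVals <0,4,8,12,1,5,9,13> (words drawn from all four
  // input quadwords, so the quad-merging above was skipped) the first PSHUFB
  // kept the V1-sourced words (result words 0,1,4,5) and zeroed the rest; the
  // mask built below keeps the V2-sourced words (result words 2,3,6,7), so
  // OR'ing the two shuffled values produces the final result.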
4577 pshufbMask.clear(); 4578 for (unsigned i = 0; i != 8; ++i) { 4579 int EltIdx = MaskVals[i] * 2; 4580 if (EltIdx < 16) { 4581 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4582 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4583 continue; 4584 } 4585 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4586 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4587 } 4588 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 4589 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4590 DAG.getNode(ISD::BUILD_VECTOR, dl, 4591 MVT::v16i8, &pshufbMask[0], 16)); 4592 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4593 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4594 } 4595 4596 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4597 // and update MaskVals with new element order. 4598 BitVector InOrder(8); 4599 if (BestLoQuad >= 0) { 4600 SmallVector<int, 8> MaskV; 4601 for (int i = 0; i != 4; ++i) { 4602 int idx = MaskVals[i]; 4603 if (idx < 0) { 4604 MaskV.push_back(-1); 4605 InOrder.set(i); 4606 } else if ((idx / 4) == BestLoQuad) { 4607 MaskV.push_back(idx & 3); 4608 InOrder.set(i); 4609 } else { 4610 MaskV.push_back(-1); 4611 } 4612 } 4613 for (unsigned i = 4; i != 8; ++i) 4614 MaskV.push_back(i); 4615 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4616 &MaskV[0]); 4617 4618 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4619 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 4620 NewV.getOperand(0), 4621 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 4622 DAG); 4623 } 4624 4625 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4626 // and update MaskVals with the new element order. 4627 if (BestHiQuad >= 0) { 4628 SmallVector<int, 8> MaskV; 4629 for (unsigned i = 0; i != 4; ++i) 4630 MaskV.push_back(i); 4631 for (unsigned i = 4; i != 8; ++i) { 4632 int idx = MaskVals[i]; 4633 if (idx < 0) { 4634 MaskV.push_back(-1); 4635 InOrder.set(i); 4636 } else if ((idx / 4) == BestHiQuad) { 4637 MaskV.push_back((idx & 3) + 4); 4638 InOrder.set(i); 4639 } else { 4640 MaskV.push_back(-1); 4641 } 4642 } 4643 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4644 &MaskV[0]); 4645 4646 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4647 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 4648 NewV.getOperand(0), 4649 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 4650 DAG); 4651 } 4652 4653 // In case BestHi & BestLo were both -1, which means each quadword has a word 4654 // from each of the four input quadwords, calculate the InOrder bitvector now 4655 // before falling through to the insert/extract cleanup. 4656 if (BestLoQuad == -1 && BestHiQuad == -1) { 4657 NewV = V1; 4658 for (int i = 0; i != 8; ++i) 4659 if (MaskVals[i] < 0 || MaskVals[i] == i) 4660 InOrder.set(i); 4661 } 4662 4663 // The other elements are put in the right place using pextrw and pinsrw. 4664 for (unsigned i = 0; i != 8; ++i) { 4665 if (InOrder[i]) 4666 continue; 4667 int EltIdx = MaskVals[i]; 4668 if (EltIdx < 0) 4669 continue; 4670 SDValue ExtOp = (EltIdx < 8) 4671 ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4672 DAG.getIntPtrConstant(EltIdx)) 4673 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4674 DAG.getIntPtrConstant(EltIdx - 8)); 4675 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4676 DAG.getIntPtrConstant(i)); 4677 } 4678 return NewV; 4679} 4680 4681// v16i8 shuffles - Prefer shuffles in the following order: 4682// 1. [ssse3] 1 x pshufb 4683// 2. [ssse3] 2 x pshufb + 1 x por 4684// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4685static 4686SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4687 SelectionDAG &DAG, 4688 const X86TargetLowering &TLI) { 4689 SDValue V1 = SVOp->getOperand(0); 4690 SDValue V2 = SVOp->getOperand(1); 4691 DebugLoc dl = SVOp->getDebugLoc(); 4692 SmallVector<int, 16> MaskVals; 4693 SVOp->getMask(MaskVals); 4694 4695 // If we have SSSE3, case 1 is generated when all result bytes come from 4696 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4697 // present, fall back to case 3. 4698 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 4699 bool V1Only = true; 4700 bool V2Only = true; 4701 for (unsigned i = 0; i < 16; ++i) { 4702 int EltIdx = MaskVals[i]; 4703 if (EltIdx < 0) 4704 continue; 4705 if (EltIdx < 16) 4706 V2Only = false; 4707 else 4708 V1Only = false; 4709 } 4710 4711 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4712 if (TLI.getSubtarget()->hasSSSE3()) { 4713 SmallVector<SDValue,16> pshufbMask; 4714 4715 // If all result elements are from one input vector, then only translate 4716 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4717 // 4718 // Otherwise, we have elements from both input vectors, and must zero out 4719 // elements that come from V2 in the first mask, and V1 in the second mask 4720 // so that we can OR them together. 4721 bool TwoInputs = !(V1Only || V2Only); 4722 for (unsigned i = 0; i != 16; ++i) { 4723 int EltIdx = MaskVals[i]; 4724 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4725 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4726 continue; 4727 } 4728 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4729 } 4730 // If all the elements are from V2, assign it to V1 and return after 4731 // building the first pshufb. 4732 if (V2Only) 4733 V1 = V2; 4734 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4735 DAG.getNode(ISD::BUILD_VECTOR, dl, 4736 MVT::v16i8, &pshufbMask[0], 16)); 4737 if (!TwoInputs) 4738 return V1; 4739 4740 // Calculate the shuffle mask for the second input, shuffle it, and 4741 // OR it with the first shuffled input. 4742 pshufbMask.clear(); 4743 for (unsigned i = 0; i != 16; ++i) { 4744 int EltIdx = MaskVals[i]; 4745 if (EltIdx < 16) { 4746 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4747 continue; 4748 } 4749 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4750 } 4751 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4752 DAG.getNode(ISD::BUILD_VECTOR, dl, 4753 MVT::v16i8, &pshufbMask[0], 16)); 4754 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4755 } 4756 4757 // No SSSE3 - Calculate in place words and then fix all out of place words 4758 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4759 // the 16 different words that comprise the two doublequadword input vectors. 4760 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4761 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 4762 SDValue NewV = V2Only ? 
V2 : V1; 4763 for (int i = 0; i != 8; ++i) { 4764 int Elt0 = MaskVals[i*2]; 4765 int Elt1 = MaskVals[i*2+1]; 4766 4767 // This word of the result is all undef, skip it. 4768 if (Elt0 < 0 && Elt1 < 0) 4769 continue; 4770 4771 // This word of the result is already in the correct place, skip it. 4772 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4773 continue; 4774 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4775 continue; 4776 4777 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4778 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4779 SDValue InsElt; 4780 4781 // If Elt0 and Elt1 are defined, are consecutive, and can be load 4782 // using a single extract together, load it and store it. 4783 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4784 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4785 DAG.getIntPtrConstant(Elt1 / 2)); 4786 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4787 DAG.getIntPtrConstant(i)); 4788 continue; 4789 } 4790 4791 // If Elt1 is defined, extract it from the appropriate source. If the 4792 // source byte is not also odd, shift the extracted word left 8 bits 4793 // otherwise clear the bottom 8 bits if we need to do an or. 4794 if (Elt1 >= 0) { 4795 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4796 DAG.getIntPtrConstant(Elt1 / 2)); 4797 if ((Elt1 & 1) == 0) 4798 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4799 DAG.getConstant(8, TLI.getShiftAmountTy())); 4800 else if (Elt0 >= 0) 4801 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4802 DAG.getConstant(0xFF00, MVT::i16)); 4803 } 4804 // If Elt0 is defined, extract it from the appropriate source. If the 4805 // source byte is not also even, shift the extracted word right 8 bits. If 4806 // Elt1 was also defined, OR the extracted values together before 4807 // inserting them in the result. 4808 if (Elt0 >= 0) { 4809 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4810 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4811 if ((Elt0 & 1) != 0) 4812 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4813 DAG.getConstant(8, TLI.getShiftAmountTy())); 4814 else if (Elt1 >= 0) 4815 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4816 DAG.getConstant(0x00FF, MVT::i16)); 4817 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4818 : InsElt0; 4819 } 4820 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4821 DAG.getIntPtrConstant(i)); 4822 } 4823 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 4824} 4825 4826/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4827/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 4828/// done when every pair / quad of shuffle mask elements point to elements in 4829/// the right sequence. e.g. 4830/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 4831static 4832SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4833 SelectionDAG &DAG, DebugLoc dl) { 4834 EVT VT = SVOp->getValueType(0); 4835 SDValue V1 = SVOp->getOperand(0); 4836 SDValue V2 = SVOp->getOperand(1); 4837 unsigned NumElems = VT.getVectorNumElements(); 4838 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 4839 EVT NewVT; 4840 switch (VT.getSimpleVT().SimpleTy) { 4841 default: assert(false && "Unexpected!"); 4842 case MVT::v4f32: NewVT = MVT::v2f64; break; 4843 case MVT::v4i32: NewVT = MVT::v2i64; break; 4844 case MVT::v8i16: NewVT = MVT::v4i32; break; 4845 case MVT::v16i8: NewVT = MVT::v4i32; break; 4846 } 4847 4848 int Scale = NumElems / NewWidth; 4849 SmallVector<int, 8> MaskVec; 4850 for (unsigned i = 0; i < NumElems; i += Scale) { 4851 int StartIdx = -1; 4852 for (int j = 0; j < Scale; ++j) { 4853 int EltIdx = SVOp->getMaskElt(i+j); 4854 if (EltIdx < 0) 4855 continue; 4856 if (StartIdx == -1) 4857 StartIdx = EltIdx - (EltIdx % Scale); 4858 if (EltIdx != StartIdx + j) 4859 return SDValue(); 4860 } 4861 if (StartIdx == -1) 4862 MaskVec.push_back(-1); 4863 else 4864 MaskVec.push_back(StartIdx / Scale); 4865 } 4866 4867 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 4868 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 4869 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4870} 4871 4872/// getVZextMovL - Return a zero-extending vector move low node. 4873/// 4874static SDValue getVZextMovL(EVT VT, EVT OpVT, 4875 SDValue SrcOp, SelectionDAG &DAG, 4876 const X86Subtarget *Subtarget, DebugLoc dl) { 4877 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4878 LoadSDNode *LD = NULL; 4879 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4880 LD = dyn_cast<LoadSDNode>(SrcOp); 4881 if (!LD) { 4882 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4883 // instead. 4884 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4885 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 4886 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4887 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 4888 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4889 // PR2108 4890 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4891 return DAG.getNode(ISD::BITCAST, dl, VT, 4892 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4893 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4894 OpVT, 4895 SrcOp.getOperand(0) 4896 .getOperand(0)))); 4897 } 4898 } 4899 } 4900 4901 return DAG.getNode(ISD::BITCAST, dl, VT, 4902 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4903 DAG.getNode(ISD::BITCAST, dl, 4904 OpVT, SrcOp))); 4905} 4906 4907/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4908/// shuffles. 4909static SDValue 4910LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4911 SDValue V1 = SVOp->getOperand(0); 4912 SDValue V2 = SVOp->getOperand(1); 4913 DebugLoc dl = SVOp->getDebugLoc(); 4914 EVT VT = SVOp->getValueType(0); 4915 4916 SmallVector<std::pair<int, int>, 8> Locs; 4917 Locs.resize(4); 4918 SmallVector<int, 8> Mask1(4U, -1); 4919 SmallVector<int, 8> PermMask; 4920 SVOp->getMask(PermMask); 4921 4922 unsigned NumHi = 0; 4923 unsigned NumLo = 0; 4924 for (unsigned i = 0; i != 4; ++i) { 4925 int Idx = PermMask[i]; 4926 if (Idx < 0) { 4927 Locs[i] = std::make_pair(-1, -1); 4928 } else { 4929 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4930 if (Idx < 4) { 4931 Locs[i] = std::make_pair(0, NumLo); 4932 Mask1[NumLo] = Idx; 4933 NumLo++; 4934 } else { 4935 Locs[i] = std::make_pair(1, NumHi); 4936 if (2+NumHi < 4) 4937 Mask1[2+NumHi] = Idx; 4938 NumHi++; 4939 } 4940 } 4941 } 4942 4943 if (NumLo <= 2 && NumHi <= 2) { 4944 // If no more than two elements come from either vector. This can be 4945 // implemented with two shuffles. First shuffle gather the elements. 
4946 // The second shuffle, which takes the first shuffle as both of its 4947 // vector operands, put the elements into the right order. 4948 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4949 4950 SmallVector<int, 8> Mask2(4U, -1); 4951 4952 for (unsigned i = 0; i != 4; ++i) { 4953 if (Locs[i].first == -1) 4954 continue; 4955 else { 4956 unsigned Idx = (i < 2) ? 0 : 4; 4957 Idx += Locs[i].first * 2 + Locs[i].second; 4958 Mask2[i] = Idx; 4959 } 4960 } 4961 4962 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4963 } else if (NumLo == 3 || NumHi == 3) { 4964 // Otherwise, we must have three elements from one vector, call it X, and 4965 // one element from the other, call it Y. First, use a shufps to build an 4966 // intermediate vector with the one element from Y and the element from X 4967 // that will be in the same half in the final destination (the indexes don't 4968 // matter). Then, use a shufps to build the final vector, taking the half 4969 // containing the element from Y from the intermediate, and the other half 4970 // from X. 4971 if (NumHi == 3) { 4972 // Normalize it so the 3 elements come from V1. 4973 CommuteVectorShuffleMask(PermMask, VT); 4974 std::swap(V1, V2); 4975 } 4976 4977 // Find the element from V2. 4978 unsigned HiIndex; 4979 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4980 int Val = PermMask[HiIndex]; 4981 if (Val < 0) 4982 continue; 4983 if (Val >= 4) 4984 break; 4985 } 4986 4987 Mask1[0] = PermMask[HiIndex]; 4988 Mask1[1] = -1; 4989 Mask1[2] = PermMask[HiIndex^1]; 4990 Mask1[3] = -1; 4991 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4992 4993 if (HiIndex >= 2) { 4994 Mask1[0] = PermMask[0]; 4995 Mask1[1] = PermMask[1]; 4996 Mask1[2] = HiIndex & 1 ? 6 : 4; 4997 Mask1[3] = HiIndex & 1 ? 4 : 6; 4998 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4999 } else { 5000 Mask1[0] = HiIndex & 1 ? 2 : 0; 5001 Mask1[1] = HiIndex & 1 ? 0 : 2; 5002 Mask1[2] = PermMask[2]; 5003 Mask1[3] = PermMask[3]; 5004 if (Mask1[2] >= 0) 5005 Mask1[2] += 4; 5006 if (Mask1[3] >= 0) 5007 Mask1[3] += 4; 5008 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 5009 } 5010 } 5011 5012 // Break it into (shuffle shuffle_hi, shuffle_lo). 
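  // (Illustrative: each of the two sub-shuffles built below gathers the
  // elements destined for one half of the result; the final shuffle then takes
  // its low two lanes from shuffle_lo and its high two lanes from shuffle_hi.)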
5013 Locs.clear(); 5014 SmallVector<int,8> LoMask(4U, -1); 5015 SmallVector<int,8> HiMask(4U, -1); 5016 5017 SmallVector<int,8> *MaskPtr = &LoMask; 5018 unsigned MaskIdx = 0; 5019 unsigned LoIdx = 0; 5020 unsigned HiIdx = 2; 5021 for (unsigned i = 0; i != 4; ++i) { 5022 if (i == 2) { 5023 MaskPtr = &HiMask; 5024 MaskIdx = 1; 5025 LoIdx = 0; 5026 HiIdx = 2; 5027 } 5028 int Idx = PermMask[i]; 5029 if (Idx < 0) { 5030 Locs[i] = std::make_pair(-1, -1); 5031 } else if (Idx < 4) { 5032 Locs[i] = std::make_pair(MaskIdx, LoIdx); 5033 (*MaskPtr)[LoIdx] = Idx; 5034 LoIdx++; 5035 } else { 5036 Locs[i] = std::make_pair(MaskIdx, HiIdx); 5037 (*MaskPtr)[HiIdx] = Idx; 5038 HiIdx++; 5039 } 5040 } 5041 5042 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 5043 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 5044 SmallVector<int, 8> MaskOps; 5045 for (unsigned i = 0; i != 4; ++i) { 5046 if (Locs[i].first == -1) { 5047 MaskOps.push_back(-1); 5048 } else { 5049 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 5050 MaskOps.push_back(Idx); 5051 } 5052 } 5053 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 5054} 5055 5056static bool MayFoldVectorLoad(SDValue V) { 5057 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5058 V = V.getOperand(0); 5059 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5060 V = V.getOperand(0); 5061 if (MayFoldLoad(V)) 5062 return true; 5063 return false; 5064} 5065 5066// FIXME: the version above should always be used. Since there's 5067// a bug where several vector shuffles can't be folded because the 5068// DAG is not updated during lowering and a node claims to have two 5069// uses while it only has one, use this version, and let isel match 5070// another instruction if the load really happens to have more than 5071// one use. Remove this version after this bug get fixed. 5072// rdar://8434668, PR8156 5073static bool RelaxedMayFoldVectorLoad(SDValue V) { 5074 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5075 V = V.getOperand(0); 5076 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5077 V = V.getOperand(0); 5078 if (ISD::isNormalLoad(V.getNode())) 5079 return true; 5080 return false; 5081} 5082 5083/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by 5084/// a vector extract, and if both can be later optimized into a single load. 5085/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 5086/// here because otherwise a target specific shuffle node is going to be 5087/// emitted for this shuffle, and the optimization not done. 5088/// FIXME: This is probably not the best approach, but fix the problem 5089/// until the right path is decided. 5090static 5091bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 5092 const TargetLowering &TLI) { 5093 EVT VT = V.getValueType(); 5094 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 5095 5096 // Be sure that the vector shuffle is present in a pattern like this: 5097 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 5098 if (!V.hasOneUse()) 5099 return false; 5100 5101 SDNode *N = *V.getNode()->use_begin(); 5102 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 5103 return false; 5104 5105 SDValue EltNo = N->getOperand(1); 5106 if (!isa<ConstantSDNode>(EltNo)) 5107 return false; 5108 5109 // If the bit convert changed the number of elements, it is unsafe 5110 // to examine the mask. 
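  // (For example, a v4i32 -> v2i64 bitcast halves the element count, so the
  // extract index would no longer line up with the shuffle mask elements.)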
5111  bool HasShuffleIntoBitcast = false;
5112  if (V.getOpcode() == ISD::BITCAST) {
5113    EVT SrcVT = V.getOperand(0).getValueType();
5114    if (SrcVT.getVectorNumElements() != VT.getVectorNumElements())
5115      return false;
5116    V = V.getOperand(0);
5117    HasShuffleIntoBitcast = true;
5118  }
5119
5120  // Select the input vector, guarding against an out of range extract index.
5121  unsigned NumElems = VT.getVectorNumElements();
5122  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
5123  int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt);
5124  V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);
5125
5126  // Skip one more bit_convert if necessary
5127  if (V.getOpcode() == ISD::BITCAST)
5128    V = V.getOperand(0);
5129
5130  if (ISD::isNormalLoad(V.getNode())) {
5131    // Is the original load suitable?
5132    LoadSDNode *LN0 = cast<LoadSDNode>(V);
5133
5134    // FIXME: avoid the multi-use bug that is preventing lots of
5135    // foldings from being detected; this is still wrong of course, but
5136    // it gives the desired temporary behavior, and if the load really
5137    // does have more uses, during isel it will not fold and will
5138    // generate poor code.
5139    if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse()
5140      return false;
5141
5142    if (!HasShuffleIntoBitcast)
5143      return true;
5144
5145    // If there's a bitcast before the shuffle, check if the load type and
5146    // alignment is valid.
5147    unsigned Align = LN0->getAlignment();
5148    unsigned NewAlign =
5149      TLI.getTargetData()->getABITypeAlignment(
5150        VT.getTypeForEVT(*DAG.getContext()));
5151
5152    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
5153      return false;
5154  }
5155
5156  return true;
5157}
5158
5159static
5160SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
5161  EVT VT = Op.getValueType();
5162
5163  // Canonicalize to v2f64.
5164  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
5165  return DAG.getNode(ISD::BITCAST, dl, VT,
5166                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
5167                                          V1, DAG));
5168}
5169
5170static
5171SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
5172                        bool HasSSE2) {
5173  SDValue V1 = Op.getOperand(0);
5174  SDValue V2 = Op.getOperand(1);
5175  EVT VT = Op.getValueType();
5176
5177  assert(VT != MVT::v2i64 && "unsupported shuffle type");
5178
5179  if (HasSSE2 && VT == MVT::v2f64)
5180    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
5181
5182  // v4f32 or v4i32
5183  return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
5184}
5185
5186static
5187SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
5188  SDValue V1 = Op.getOperand(0);
5189  SDValue V2 = Op.getOperand(1);
5190  EVT VT = Op.getValueType();
5191
5192  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
5193         "unsupported shuffle type");
5194
5195  if (V2.getOpcode() == ISD::UNDEF)
5196    V2 = V1;
5197
5198  // v4i32 or v4f32
5199  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
5200}
5201
5202static
5203SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
5204  SDValue V1 = Op.getOperand(0);
5205  SDValue V2 = Op.getOperand(1);
5206  EVT VT = Op.getValueType();
5207  unsigned NumElems = VT.getVectorNumElements();
5208
5209  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
5210  // operand of these instructions is only memory, so check if there's a
5211  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
5212  // same masks.
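  // (Illustrative: a v4f32 shuffle with mask <4,5,2,3> where V2 is a load can
  // be selected as a single movlps, which rewrites the two low lanes of V1
  // directly from memory.)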
5213  bool CanFoldLoad = false;
5214
5215  // Trivial case, when V2 comes from a load.
5216  if (MayFoldVectorLoad(V2))
5217    CanFoldLoad = true;
5218
5219  // When V1 is a load, it can be folded later into a store in isel, example:
5220  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
5221  //    turns into:
5222  //  (MOVLPSmr addr:$src1, VR128:$src2)
5223  // So, recognize this potential and also use MOVLPS or MOVLPD.
5224  if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
5225    CanFoldLoad = true;
5226
5227  if (CanFoldLoad) {
5228    if (HasSSE2 && NumElems == 2)
5229      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
5230
5231    if (NumElems == 4)
5232      return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
5233  }
5234
5235  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5236  // movl and movlp will both match v2i64, but v2i64 is never matched by
5237  // movl earlier because we make it strict to avoid messing with the movlp
5238  // load folding logic (see the code above the getMOVLP call). Match it here
5239  // instead; this is horrible, but it will stay like this until we move all
5240  // shuffle matching to x86-specific nodes. Note that for the 1st condition
5241  // all types are matched with movsd.
5242  if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
5243    return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
5244  else if (HasSSE2)
5245    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
5246
5247
5248  assert(VT != MVT::v4i32 && "unsupported shuffle type");
5249
5250  // Invert the operand order and use SHUFPS to match it.
5251  return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
5252                              X86::getShuffleSHUFImmediate(SVOp), DAG);
5253}
5254
5255static inline unsigned getUNPCKLOpcode(EVT VT) {
5256  switch(VT.getSimpleVT().SimpleTy) {
5257  case MVT::v4i32: return X86ISD::PUNPCKLDQ;
5258  case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
5259  case MVT::v4f32: return X86ISD::UNPCKLPS;
5260  case MVT::v2f64: return X86ISD::UNPCKLPD;
5261  case MVT::v16i8: return X86ISD::PUNPCKLBW;
5262  case MVT::v8i16: return X86ISD::PUNPCKLWD;
5263  default:
5264    llvm_unreachable("Unknown type for unpckl");
5265  }
5266  return 0;
5267}
5268
5269static inline unsigned getUNPCKHOpcode(EVT VT) {
5270  switch(VT.getSimpleVT().SimpleTy) {
5271  case MVT::v4i32: return X86ISD::PUNPCKHDQ;
5272  case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
5273  case MVT::v4f32: return X86ISD::UNPCKHPS;
5274  case MVT::v2f64: return X86ISD::UNPCKHPD;
5275  case MVT::v16i8: return X86ISD::PUNPCKHBW;
5276  case MVT::v8i16: return X86ISD::PUNPCKHWD;
5277  default:
5278    llvm_unreachable("Unknown type for unpckh");
5279  }
5280  return 0;
5281}
5282
5283static
5284SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
5285                               const TargetLowering &TLI,
5286                               const X86Subtarget *Subtarget) {
5287  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5288  EVT VT = Op.getValueType();
5289  DebugLoc dl = Op.getDebugLoc();
5290  SDValue V1 = Op.getOperand(0);
5291  SDValue V2 = Op.getOperand(1);
5292
5293  if (isZeroShuffle(SVOp))
5294    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
5295
5296  // Handle splat operations
5297  if (SVOp->isSplat()) {
5298    // Special case, this is the only place now where it's
5299    // allowed to return a vector_shuffle operation without
5300    // using a target specific node, because *hopefully* it
5301    // will be optimized away by the dag combiner.
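    // (The pattern of interest is the (vextract (shuffle (load))) case checked
    // by CanXFormVExtractWithShuffleIntoLoad just below; emitting a target
    // node here would hide it from the DAG combine that turns it into a
    // scalar load.)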
5302 if (VT.getVectorNumElements() <= 4 && 5303 CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) 5304 return Op; 5305 5306 // Handle splats by matching through known masks 5307 if (VT.getVectorNumElements() <= 4) 5308 return SDValue(); 5309 5310 // Canonicalize all of the remaining to v4f32. 5311 return PromoteSplat(SVOp, DAG); 5312 } 5313 5314 // If the shuffle can be profitably rewritten as a narrower shuffle, then 5315 // do it! 5316 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 5317 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5318 if (NewOp.getNode()) 5319 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 5320 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 5321 // FIXME: Figure out a cleaner way to do this. 5322 // Try to make use of movq to zero out the top part. 5323 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 5324 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5325 if (NewOp.getNode()) { 5326 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 5327 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 5328 DAG, Subtarget, dl); 5329 } 5330 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 5331 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5332 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 5333 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 5334 DAG, Subtarget, dl); 5335 } 5336 } 5337 return SDValue(); 5338} 5339 5340SDValue 5341X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 5342 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5343 SDValue V1 = Op.getOperand(0); 5344 SDValue V2 = Op.getOperand(1); 5345 EVT VT = Op.getValueType(); 5346 DebugLoc dl = Op.getDebugLoc(); 5347 unsigned NumElems = VT.getVectorNumElements(); 5348 bool isMMX = VT.getSizeInBits() == 64; 5349 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 5350 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 5351 bool V1IsSplat = false; 5352 bool V2IsSplat = false; 5353 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); 5354 bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); 5355 bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX(); 5356 MachineFunction &MF = DAG.getMachineFunction(); 5357 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 5358 5359 // Shuffle operations on MMX not supported. 5360 if (isMMX) 5361 return Op; 5362 5363 // Vector shuffle lowering takes 3 steps: 5364 // 5365 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 5366 // narrowing and commutation of operands should be handled. 5367 // 2) Matching of shuffles with known shuffle masks to x86 target specific 5368 // shuffle nodes. 5369 // 3) Rewriting of unmatched masks into new generic shuffle operations, 5370 // so the shuffle can be broken into other shuffles and the legalizer can 5371 // try the lowering again. 5372 // 5373 // The general ideia is that no vector_shuffle operation should be left to 5374 // be matched during isel, all of them must be converted to a target specific 5375 // node here. 5376 5377 // Normalize the input vectors. Here splats, zeroed vectors, profitable 5378 // narrowing and commutation of operands should be handled. The actual code 5379 // doesn't include all of those, work in progress... 
5380  SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget);
5381  if (NewOp.getNode())
5382    return NewOp;
5383
5384  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
5385  // unpckh_undef). Only use pshufd if speed is more important than size.
5386  if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
5387    if (VT != MVT::v2i64 && VT != MVT::v2f64)
5388      return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
5389  if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
5390    if (VT != MVT::v2i64 && VT != MVT::v2f64)
5391      return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
5392
5393  if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef &&
5394      RelaxedMayFoldVectorLoad(V1))
5395    return getMOVDDup(Op, dl, V1, DAG);
5396
5397  if (X86::isMOVHLPS_v_undef_Mask(SVOp))
5398    return getMOVHighToLow(Op, dl, DAG);
5399
5400  // Used to match splats
5401  if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
5402      (VT == MVT::v2f64 || VT == MVT::v2i64))
5403    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
5404
5405  if (X86::isPSHUFDMask(SVOp)) {
5406    // isPSHUFDMask matches the mask checked in the 'if' above, but during
5407    // isel it can be selected as several different instructions, not only
5408    // pshufd as its name suggests. Sad but true; emulate that behavior for
5409    // now...
5409    if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
5410      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
5411
5412    unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
5413
5414    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
5415      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
5416
5417    if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
5418      return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1,
5419                                  TargetMask, DAG);
5420
5421    if (VT == MVT::v4f32)
5422      return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1,
5423                                  TargetMask, DAG);
5424  }
5425
5426  // Check if this can be converted into a logical shift.
5427  bool isLeft = false;
5428  unsigned ShAmt = 0;
5429  SDValue ShVal;
5430  bool isShift = getSubtarget()->hasSSE2() &&
5431    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
5432  if (isShift && ShVal.hasOneUse()) {
5433    // If the shifted value has multiple uses, it may be cheaper to use
5434    // v_set0 + movlhps or movhlps, etc.
5435    EVT EltVT = VT.getVectorElementType();
5436    ShAmt *= EltVT.getSizeInBits();
5437    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
5438  }
5439
5440  if (X86::isMOVLMask(SVOp)) {
5441    if (V1IsUndef)
5442      return V2;
5443    if (ISD::isBuildVectorAllZeros(V1.getNode()))
5444      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
5445    if (!X86::isMOVLPMask(SVOp)) {
5446      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
5447        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
5448
5449      if (VT == MVT::v4i32 || VT == MVT::v4f32)
5450        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
5451    }
5452  }
5453
5454  // FIXME: fold these into legal mask.
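  // (Illustrative: for v4f32 a MOVLHPS mask is <0,1,4,5>, concatenating the
  // low halves of the two inputs, while a MOVHLPS mask is <6,7,2,3>, taking
  // the high half of V2 followed by the high half of V1.)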
5455 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 5456 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 5457 5458 if (X86::isMOVHLPSMask(SVOp)) 5459 return getMOVHighToLow(Op, dl, DAG); 5460 5461 if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5462 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 5463 5464 if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5465 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 5466 5467 if (X86::isMOVLPMask(SVOp)) 5468 return getMOVLP(Op, dl, DAG, HasSSE2); 5469 5470 if (ShouldXformToMOVHLPS(SVOp) || 5471 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 5472 return CommuteVectorShuffle(SVOp, DAG); 5473 5474 if (isShift) { 5475 // No better options. Use a vshl / vsrl. 5476 EVT EltVT = VT.getVectorElementType(); 5477 ShAmt *= EltVT.getSizeInBits(); 5478 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5479 } 5480 5481 bool Commuted = false; 5482 // FIXME: This should also accept a bitcast of a splat? Be careful, not 5483 // 1,1,1,1 -> v8i16 though. 5484 V1IsSplat = isSplatVector(V1.getNode()); 5485 V2IsSplat = isSplatVector(V2.getNode()); 5486 5487 // Canonicalize the splat or undef, if present, to be on the RHS. 5488 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 5489 Op = CommuteVectorShuffle(SVOp, DAG); 5490 SVOp = cast<ShuffleVectorSDNode>(Op); 5491 V1 = SVOp->getOperand(0); 5492 V2 = SVOp->getOperand(1); 5493 std::swap(V1IsSplat, V2IsSplat); 5494 std::swap(V1IsUndef, V2IsUndef); 5495 Commuted = true; 5496 } 5497 5498 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 5499 // Shuffling low element of v1 into undef, just return v1. 5500 if (V2IsUndef) 5501 return V1; 5502 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 5503 // the instruction selector will not match, so get a canonical MOVL with 5504 // swapped operands to undo the commute. 5505 return getMOVL(DAG, dl, VT, V2, V1); 5506 } 5507 5508 if (X86::isUNPCKLMask(SVOp)) 5509 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); 5510 5511 if (X86::isUNPCKHMask(SVOp)) 5512 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); 5513 5514 if (V2IsSplat) { 5515 // Normalize mask so all entries that point to V2 points to its first 5516 // element then try to match unpck{h|l} again. If match, return a 5517 // new vector_shuffle with the corrected mask. 5518 SDValue NewMask = NormalizeMask(SVOp, DAG); 5519 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 5520 if (NSVOp != SVOp) { 5521 if (X86::isUNPCKLMask(NSVOp, true)) { 5522 return NewMask; 5523 } else if (X86::isUNPCKHMask(NSVOp, true)) { 5524 return NewMask; 5525 } 5526 } 5527 } 5528 5529 if (Commuted) { 5530 // Commute is back and try unpck* again. 5531 // FIXME: this seems wrong. 
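    // (Illustrative: an original v4f32 mask <4,0,5,1> commutes to <0,4,1,5>,
    // the unpcklps pattern with the operands swapped, hence V2, V1 below.)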
5532 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 5533 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 5534 5535 if (X86::isUNPCKLMask(NewSVOp)) 5536 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); 5537 5538 if (X86::isUNPCKHMask(NewSVOp)) 5539 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); 5540 } 5541 5542 // Normalize the node to match x86 shuffle ops if needed 5543 if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 5544 return CommuteVectorShuffle(SVOp, DAG); 5545 5546 // The checks below are all present in isShuffleMaskLegal, but they are 5547 // inlined here right now to enable us to directly emit target specific 5548 // nodes, and remove one by one until they don't return Op anymore. 5549 SmallVector<int, 16> M; 5550 SVOp->getMask(M); 5551 5552 if (isPALIGNRMask(M, VT, HasSSSE3)) 5553 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 5554 X86::getShufflePALIGNRImmediate(SVOp), 5555 DAG); 5556 5557 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 5558 SVOp->getSplatIndex() == 0 && V2IsUndef) { 5559 if (VT == MVT::v2f64) 5560 return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); 5561 if (VT == MVT::v2i64) 5562 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 5563 } 5564 5565 if (isPSHUFHWMask(M, VT)) 5566 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 5567 X86::getShufflePSHUFHWImmediate(SVOp), 5568 DAG); 5569 5570 if (isPSHUFLWMask(M, VT)) 5571 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 5572 X86::getShufflePSHUFLWImmediate(SVOp), 5573 DAG); 5574 5575 if (isSHUFPMask(M, VT)) { 5576 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5577 if (VT == MVT::v4f32 || VT == MVT::v4i32) 5578 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, 5579 TargetMask, DAG); 5580 if (VT == MVT::v2f64 || VT == MVT::v2i64) 5581 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, 5582 TargetMask, DAG); 5583 } 5584 5585 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 5586 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5587 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 5588 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 5589 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5590 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5591 5592 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 5593 if (VT == MVT::v8i16) { 5594 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 5595 if (NewOp.getNode()) 5596 return NewOp; 5597 } 5598 5599 if (VT == MVT::v16i8) { 5600 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 5601 if (NewOp.getNode()) 5602 return NewOp; 5603 } 5604 5605 // Handle all 4 wide cases with a number of shuffles. 5606 if (NumElems == 4) 5607 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 5608 5609 return SDValue(); 5610} 5611 5612SDValue 5613X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 5614 SelectionDAG &DAG) const { 5615 EVT VT = Op.getValueType(); 5616 DebugLoc dl = Op.getDebugLoc(); 5617 if (VT.getSizeInBits() == 8) { 5618 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 5619 Op.getOperand(0), Op.getOperand(1)); 5620 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5621 DAG.getValueType(VT)); 5622 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5623 } else if (VT.getSizeInBits() == 16) { 5624 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5625 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 
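    // (The bitcast-to-v4i32 extract below is selected as a plain movd of the
    // low dword, and the truncate back to i16 is then free.)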
5626 if (Idx == 0) 5627 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5628 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5629 DAG.getNode(ISD::BITCAST, dl, 5630 MVT::v4i32, 5631 Op.getOperand(0)), 5632 Op.getOperand(1))); 5633 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 5634 Op.getOperand(0), Op.getOperand(1)); 5635 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5636 DAG.getValueType(VT)); 5637 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5638 } else if (VT == MVT::f32) { 5639 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 5640 // the result back to FR32 register. It's only worth matching if the 5641 // result has a single use which is a store or a bitcast to i32. And in 5642 // the case of a store, it's not worth it if the index is a constant 0, 5643 // because a MOVSSmr can be used instead, which is smaller and faster. 5644 if (!Op.hasOneUse()) 5645 return SDValue(); 5646 SDNode *User = *Op.getNode()->use_begin(); 5647 if ((User->getOpcode() != ISD::STORE || 5648 (isa<ConstantSDNode>(Op.getOperand(1)) && 5649 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5650 (User->getOpcode() != ISD::BITCAST || 5651 User->getValueType(0) != MVT::i32)) 5652 return SDValue(); 5653 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5654 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 5655 Op.getOperand(0)), 5656 Op.getOperand(1)); 5657 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 5658 } else if (VT == MVT::i32) { 5659 // ExtractPS works with constant index. 5660 if (isa<ConstantSDNode>(Op.getOperand(1))) 5661 return Op; 5662 } 5663 return SDValue(); 5664} 5665 5666 5667SDValue 5668X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5669 SelectionDAG &DAG) const { 5670 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5671 return SDValue(); 5672 5673 if (Subtarget->hasSSE41()) { 5674 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 5675 if (Res.getNode()) 5676 return Res; 5677 } 5678 5679 EVT VT = Op.getValueType(); 5680 DebugLoc dl = Op.getDebugLoc(); 5681 // TODO: handle v16i8. 5682 if (VT.getSizeInBits() == 16) { 5683 SDValue Vec = Op.getOperand(0); 5684 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5685 if (Idx == 0) 5686 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5687 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5688 DAG.getNode(ISD::BITCAST, dl, 5689 MVT::v4i32, Vec), 5690 Op.getOperand(1))); 5691 // Transform it so it match pextrw which produces a 32-bit result. 5692 EVT EltVT = MVT::i32; 5693 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 5694 Op.getOperand(0), Op.getOperand(1)); 5695 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 5696 DAG.getValueType(VT)); 5697 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5698 } else if (VT.getSizeInBits() == 32) { 5699 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5700 if (Idx == 0) 5701 return Op; 5702 5703 // SHUFPS the element to the lowest double word, then movss. 
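    // (Illustrative: extracting element 2 builds the <2,u,u,u> shuffle below,
    // which moves lane 2 into lane 0; the extract of lane 0 is then a simple
    // scalar move.)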
5704 int Mask[4] = { Idx, -1, -1, -1 }; 5705 EVT VVT = Op.getOperand(0).getValueType(); 5706 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5707 DAG.getUNDEF(VVT), Mask); 5708 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5709 DAG.getIntPtrConstant(0)); 5710 } else if (VT.getSizeInBits() == 64) { 5711 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 5712 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 5713 // to match extract_elt for f64. 5714 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5715 if (Idx == 0) 5716 return Op; 5717 5718 // UNPCKHPD the element to the lowest double word, then movsd. 5719 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 5720 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 5721 int Mask[2] = { 1, -1 }; 5722 EVT VVT = Op.getOperand(0).getValueType(); 5723 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5724 DAG.getUNDEF(VVT), Mask); 5725 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5726 DAG.getIntPtrConstant(0)); 5727 } 5728 5729 return SDValue(); 5730} 5731 5732SDValue 5733X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 5734 SelectionDAG &DAG) const { 5735 EVT VT = Op.getValueType(); 5736 EVT EltVT = VT.getVectorElementType(); 5737 DebugLoc dl = Op.getDebugLoc(); 5738 5739 SDValue N0 = Op.getOperand(0); 5740 SDValue N1 = Op.getOperand(1); 5741 SDValue N2 = Op.getOperand(2); 5742 5743 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 5744 isa<ConstantSDNode>(N2)) { 5745 unsigned Opc; 5746 if (VT == MVT::v8i16) 5747 Opc = X86ISD::PINSRW; 5748 else if (VT == MVT::v16i8) 5749 Opc = X86ISD::PINSRB; 5750 else 5751 Opc = X86ISD::PINSRB; 5752 5753 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 5754 // argument. 5755 if (N1.getValueType() != MVT::i32) 5756 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5757 if (N2.getValueType() != MVT::i32) 5758 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5759 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 5760 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 5761 // Bits [7:6] of the constant are the source select. This will always be 5762 // zero here. The DAG Combiner may combine an extract_elt index into these 5763 // bits. For example (insert (extract, 3), 2) could be matched by putting 5764 // the '3' into bits [7:6] of X86ISD::INSERTPS. 5765 // Bits [5:4] of the constant are the destination select. This is the 5766 // value of the incoming immediate. 5767 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 5768 // combine either bitwise AND or insert of float 0.0 to set these bits. 5769 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 5770 // Create this as a scalar to vector.. 5771 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 5772 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 5773 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 5774 // PINSR* works with constant index. 
5775 return Op; 5776 } 5777 return SDValue(); 5778} 5779 5780SDValue 5781X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 5782 EVT VT = Op.getValueType(); 5783 EVT EltVT = VT.getVectorElementType(); 5784 5785 if (Subtarget->hasSSE41()) 5786 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 5787 5788 if (EltVT == MVT::i8) 5789 return SDValue(); 5790 5791 DebugLoc dl = Op.getDebugLoc(); 5792 SDValue N0 = Op.getOperand(0); 5793 SDValue N1 = Op.getOperand(1); 5794 SDValue N2 = Op.getOperand(2); 5795 5796 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5797 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 5798 // as its second argument. 5799 if (N1.getValueType() != MVT::i32) 5800 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5801 if (N2.getValueType() != MVT::i32) 5802 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5803 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 5804 } 5805 return SDValue(); 5806} 5807 5808SDValue 5809X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5810 DebugLoc dl = Op.getDebugLoc(); 5811 5812 if (Op.getValueType() == MVT::v1i64 && 5813 Op.getOperand(0).getValueType() == MVT::i64) 5814 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5815 5816 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5817 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && 5818 "Expected an SSE type!"); 5819 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), 5820 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 5821} 5822 5823// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 5824// a simple subregister reference or explicit instructions to grab 5825// upper bits of a vector. 5826SDValue 5827X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 5828 if (Subtarget->hasAVX()) { 5829 // TODO 5830 } 5831 return SDValue(); 5832} 5833 5834// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 5835// simple superregister reference or explicit instructions to insert 5836// the upper bits of a vector. 5837SDValue 5838X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 5839 if (Subtarget->hasAVX()) { 5840 DebugLoc dl = Op.getNode()->getDebugLoc(); 5841 SDValue Vec = Op.getNode()->getOperand(0); 5842 SDValue SubVec = Op.getNode()->getOperand(1); 5843 SDValue Idx = Op.getNode()->getOperand(2); 5844 5845 if (Op.getNode()->getValueType(0).getSizeInBits() == 256 5846 && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { 5847 // TODO 5848 } 5849 } 5850 return SDValue(); 5851} 5852 5853// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5854// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 5855// one of the above mentioned nodes. It has to be wrapped because otherwise 5856// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 5857// be used to form addressing mode. These wrapped nodes will be selected 5858// into MOV32ri. 5859SDValue 5860X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5861 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5862 5863 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5864 // global base reg. 
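  // (Illustrative: on 32-bit ELF PIC the constant pool entry ends up addressed
  // as label@GOTOFF relative to the base register materialized by
  // X86ISD::GlobalBaseReg; the flags below select which form is used.)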
5865 unsigned char OpFlag = 0; 5866 unsigned WrapperKind = X86ISD::Wrapper; 5867 CodeModel::Model M = getTargetMachine().getCodeModel(); 5868 5869 if (Subtarget->isPICStyleRIPRel() && 5870 (M == CodeModel::Small || M == CodeModel::Kernel)) 5871 WrapperKind = X86ISD::WrapperRIP; 5872 else if (Subtarget->isPICStyleGOT()) 5873 OpFlag = X86II::MO_GOTOFF; 5874 else if (Subtarget->isPICStyleStubPIC()) 5875 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5876 5877 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5878 CP->getAlignment(), 5879 CP->getOffset(), OpFlag); 5880 DebugLoc DL = CP->getDebugLoc(); 5881 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5882 // With PIC, the address is actually $g + Offset. 5883 if (OpFlag) { 5884 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5885 DAG.getNode(X86ISD::GlobalBaseReg, 5886 DebugLoc(), getPointerTy()), 5887 Result); 5888 } 5889 5890 return Result; 5891} 5892 5893SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5894 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5895 5896 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5897 // global base reg. 5898 unsigned char OpFlag = 0; 5899 unsigned WrapperKind = X86ISD::Wrapper; 5900 CodeModel::Model M = getTargetMachine().getCodeModel(); 5901 5902 if (Subtarget->isPICStyleRIPRel() && 5903 (M == CodeModel::Small || M == CodeModel::Kernel)) 5904 WrapperKind = X86ISD::WrapperRIP; 5905 else if (Subtarget->isPICStyleGOT()) 5906 OpFlag = X86II::MO_GOTOFF; 5907 else if (Subtarget->isPICStyleStubPIC()) 5908 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5909 5910 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5911 OpFlag); 5912 DebugLoc DL = JT->getDebugLoc(); 5913 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5914 5915 // With PIC, the address is actually $g + Offset. 5916 if (OpFlag) 5917 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5918 DAG.getNode(X86ISD::GlobalBaseReg, 5919 DebugLoc(), getPointerTy()), 5920 Result); 5921 5922 return Result; 5923} 5924 5925SDValue 5926X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5927 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5928 5929 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5930 // global base reg. 5931 unsigned char OpFlag = 0; 5932 unsigned WrapperKind = X86ISD::Wrapper; 5933 CodeModel::Model M = getTargetMachine().getCodeModel(); 5934 5935 if (Subtarget->isPICStyleRIPRel() && 5936 (M == CodeModel::Small || M == CodeModel::Kernel)) 5937 WrapperKind = X86ISD::WrapperRIP; 5938 else if (Subtarget->isPICStyleGOT()) 5939 OpFlag = X86II::MO_GOTOFF; 5940 else if (Subtarget->isPICStyleStubPIC()) 5941 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5942 5943 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5944 5945 DebugLoc DL = Op.getDebugLoc(); 5946 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5947 5948 5949 // With PIC, the address is actually $g + Offset. 5950 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5951 !Subtarget->is64Bit()) { 5952 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5953 DAG.getNode(X86ISD::GlobalBaseReg, 5954 DebugLoc(), getPointerTy()), 5955 Result); 5956 } 5957 5958 return Result; 5959} 5960 5961SDValue 5962X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5963 // Create the TargetBlockAddressAddress node. 
5964 unsigned char OpFlags = 5965 Subtarget->ClassifyBlockAddressReference(); 5966 CodeModel::Model M = getTargetMachine().getCodeModel(); 5967 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5968 DebugLoc dl = Op.getDebugLoc(); 5969 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5970 /*isTarget=*/true, OpFlags); 5971 5972 if (Subtarget->isPICStyleRIPRel() && 5973 (M == CodeModel::Small || M == CodeModel::Kernel)) 5974 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5975 else 5976 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5977 5978 // With PIC, the address is actually $g + Offset. 5979 if (isGlobalRelativeToPICBase(OpFlags)) { 5980 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5981 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5982 Result); 5983 } 5984 5985 return Result; 5986} 5987 5988SDValue 5989X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5990 int64_t Offset, 5991 SelectionDAG &DAG) const { 5992 // Create the TargetGlobalAddress node, folding in the constant 5993 // offset if it is legal. 5994 unsigned char OpFlags = 5995 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5996 CodeModel::Model M = getTargetMachine().getCodeModel(); 5997 SDValue Result; 5998 if (OpFlags == X86II::MO_NO_FLAG && 5999 X86::isOffsetSuitableForCodeModel(Offset, M)) { 6000 // A direct static reference to a global. 6001 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 6002 Offset = 0; 6003 } else { 6004 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 6005 } 6006 6007 if (Subtarget->isPICStyleRIPRel() && 6008 (M == CodeModel::Small || M == CodeModel::Kernel)) 6009 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6010 else 6011 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6012 6013 // With PIC, the address is actually $g + Offset. 6014 if (isGlobalRelativeToPICBase(OpFlags)) { 6015 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6016 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6017 Result); 6018 } 6019 6020 // For globals that require a load from a stub to get the address, emit the 6021 // load. 6022 if (isGlobalStubReference(OpFlags)) 6023 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 6024 MachinePointerInfo::getGOT(), false, false, 0); 6025 6026 // If there was a non-zero offset that we didn't fold, create an explicit 6027 // addition for it. 
6028 if (Offset != 0) 6029 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 6030 DAG.getConstant(Offset, getPointerTy())); 6031 6032 return Result; 6033} 6034 6035SDValue 6036X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 6037 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 6038 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 6039 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 6040} 6041 6042static SDValue 6043GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 6044 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 6045 unsigned char OperandFlags) { 6046 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6047 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 6048 DebugLoc dl = GA->getDebugLoc(); 6049 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6050 GA->getValueType(0), 6051 GA->getOffset(), 6052 OperandFlags); 6053 if (InFlag) { 6054 SDValue Ops[] = { Chain, TGA, *InFlag }; 6055 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 6056 } else { 6057 SDValue Ops[] = { Chain, TGA }; 6058 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 6059 } 6060 6061 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 6062 MFI->setAdjustsStack(true); 6063 6064 SDValue Flag = Chain.getValue(1); 6065 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 6066} 6067 6068// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 6069static SDValue 6070LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6071 const EVT PtrVT) { 6072 SDValue InFlag; 6073 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 6074 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 6075 DAG.getNode(X86ISD::GlobalBaseReg, 6076 DebugLoc(), PtrVT), InFlag); 6077 InFlag = Chain.getValue(1); 6078 6079 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 6080} 6081 6082// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 6083static SDValue 6084LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6085 const EVT PtrVT) { 6086 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 6087 X86::RAX, X86II::MO_TLSGD); 6088} 6089 6090// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 6091// "local exec" model. 6092static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6093 const EVT PtrVT, TLSModel::Model model, 6094 bool is64Bit) { 6095 DebugLoc dl = GA->getDebugLoc(); 6096 6097 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 6098 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 6099 is64Bit ? 257 : 256)); 6100 6101 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 6102 DAG.getIntPtrConstant(0), 6103 MachinePointerInfo(Ptr), false, false, 0); 6104 6105 unsigned char OperandFlags = 0; 6106 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 6107 // initialexec. 6108 unsigned WrapperKind = X86ISD::Wrapper; 6109 if (model == TLSModel::LocalExec) { 6110 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 6111 } else if (is64Bit) { 6112 assert(model == TLSModel::InitialExec); 6113 OperandFlags = X86II::MO_GOTTPOFF; 6114 WrapperKind = X86ISD::WrapperRIP; 6115 } else { 6116 assert(model == TLSModel::InitialExec); 6117 OperandFlags = X86II::MO_INDNTPOFF; 6118 } 6119 6120 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 6121 // exec) 6122 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6123 GA->getValueType(0), 6124 GA->getOffset(), OperandFlags); 6125 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 6126 6127 if (model == TLSModel::InitialExec) 6128 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 6129 MachinePointerInfo::getGOT(), false, false, 0); 6130 6131 // The address of the thread local variable is the add of the thread 6132 // pointer with the offset of the variable. 6133 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 6134} 6135 6136SDValue 6137X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 6138 6139 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 6140 const GlobalValue *GV = GA->getGlobal(); 6141 6142 if (Subtarget->isTargetELF()) { 6143 // TODO: implement the "local dynamic" model 6144 // TODO: implement the "initial exec"model for pic executables 6145 6146 // If GV is an alias then use the aliasee for determining 6147 // thread-localness. 6148 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 6149 GV = GA->resolveAliasedGlobal(false); 6150 6151 TLSModel::Model model 6152 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 6153 6154 switch (model) { 6155 case TLSModel::GeneralDynamic: 6156 case TLSModel::LocalDynamic: // not implemented 6157 if (Subtarget->is64Bit()) 6158 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 6159 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 6160 6161 case TLSModel::InitialExec: 6162 case TLSModel::LocalExec: 6163 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 6164 Subtarget->is64Bit()); 6165 } 6166 } else if (Subtarget->isTargetDarwin()) { 6167 // Darwin only has one model of TLS. Lower to that. 6168 unsigned char OpFlag = 0; 6169 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 6170 X86ISD::WrapperRIP : X86ISD::Wrapper; 6171 6172 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6173 // global base reg. 6174 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 6175 !Subtarget->is64Bit(); 6176 if (PIC32) 6177 OpFlag = X86II::MO_TLVP_PIC_BASE; 6178 else 6179 OpFlag = X86II::MO_TLVP; 6180 DebugLoc DL = Op.getDebugLoc(); 6181 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 6182 GA->getValueType(0), 6183 GA->getOffset(), OpFlag); 6184 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6185 6186 // With PIC32, the address is actually $g + Offset. 6187 if (PIC32) 6188 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6189 DAG.getNode(X86ISD::GlobalBaseReg, 6190 DebugLoc(), getPointerTy()), 6191 Offset); 6192 6193 // Lowering the machine isd will make sure everything is in the right 6194 // location. 6195 SDValue Chain = DAG.getEntryNode(); 6196 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 6197 SDValue Args[] = { Chain, Offset }; 6198 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 6199 6200 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
6201 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6202 MFI->setAdjustsStack(true); 6203 6204 // And our return value (tls address) is in the standard call return value 6205 // location. 6206 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 6207 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 6208 } 6209 6210 assert(false && 6211 "TLS not implemented for this target."); 6212 6213 llvm_unreachable("Unreachable"); 6214 return SDValue(); 6215} 6216 6217 6218/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 6219/// take a 2 x i32 value to shift plus a shift amount. 6220SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 6221 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6222 EVT VT = Op.getValueType(); 6223 unsigned VTBits = VT.getSizeInBits(); 6224 DebugLoc dl = Op.getDebugLoc(); 6225 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 6226 SDValue ShOpLo = Op.getOperand(0); 6227 SDValue ShOpHi = Op.getOperand(1); 6228 SDValue ShAmt = Op.getOperand(2); 6229 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 6230 DAG.getConstant(VTBits - 1, MVT::i8)) 6231 : DAG.getConstant(0, VT); 6232 6233 SDValue Tmp2, Tmp3; 6234 if (Op.getOpcode() == ISD::SHL_PARTS) { 6235 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 6236 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6237 } else { 6238 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 6239 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 6240 } 6241 6242 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 6243 DAG.getConstant(VTBits, MVT::i8)); 6244 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6245 AndNode, DAG.getConstant(0, MVT::i8)); 6246 6247 SDValue Hi, Lo; 6248 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6249 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 6250 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 6251 6252 if (Op.getOpcode() == ISD::SHL_PARTS) { 6253 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6254 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6255 } else { 6256 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6257 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6258 } 6259 6260 SDValue Ops[2] = { Lo, Hi }; 6261 return DAG.getMergeValues(Ops, 2, dl); 6262} 6263 6264SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 6265 SelectionDAG &DAG) const { 6266 EVT SrcVT = Op.getOperand(0).getValueType(); 6267 6268 if (SrcVT.isVector()) 6269 return SDValue(); 6270 6271 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 6272 "Unknown SINT_TO_FP to lower!"); 6273 6274 // These are really Legal; return the operand so the caller accepts it as 6275 // Legal. 
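  // (Added note: cvtsi2ss/cvtsi2sd convert a 32-bit GPR or memory operand
  // directly, and in 64-bit mode their REX.W forms accept an i64 source, so
  // no expansion is needed when the result lives in an SSE register.)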
6276 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 6277 return Op; 6278 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 6279 Subtarget->is64Bit()) { 6280 return Op; 6281 } 6282 6283 DebugLoc dl = Op.getDebugLoc(); 6284 unsigned Size = SrcVT.getSizeInBits()/8; 6285 MachineFunction &MF = DAG.getMachineFunction(); 6286 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 6287 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6288 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6289 StackSlot, 6290 MachinePointerInfo::getFixedStack(SSFI), 6291 false, false, 0); 6292 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 6293} 6294 6295SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 6296 SDValue StackSlot, 6297 SelectionDAG &DAG) const { 6298 // Build the FILD 6299 DebugLoc DL = Op.getDebugLoc(); 6300 SDVTList Tys; 6301 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 6302 if (useSSE) 6303 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 6304 else 6305 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 6306 6307 unsigned ByteSize = SrcVT.getSizeInBits()/8; 6308 6309 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6310 MachineMemOperand *MMO = 6311 DAG.getMachineFunction() 6312 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6313 MachineMemOperand::MOLoad, ByteSize, ByteSize); 6314 6315 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 6316 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 6317 X86ISD::FILD, DL, 6318 Tys, Ops, array_lengthof(Ops), 6319 SrcVT, MMO); 6320 6321 if (useSSE) { 6322 Chain = Result.getValue(1); 6323 SDValue InFlag = Result.getValue(2); 6324 6325 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 6326 // shouldn't be necessary except that RFP cannot be live across 6327 // multiple blocks. When stackifier is fixed, they can be uncoupled. 6328 MachineFunction &MF = DAG.getMachineFunction(); 6329 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 6330 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 6331 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6332 Tys = DAG.getVTList(MVT::Other); 6333 SDValue Ops[] = { 6334 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 6335 }; 6336 MachineMemOperand *MMO = 6337 DAG.getMachineFunction() 6338 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6339 MachineMemOperand::MOStore, SSFISize, SSFISize); 6340 6341 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 6342 Ops, array_lengthof(Ops), 6343 Op.getValueType(), MMO); 6344 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 6345 MachinePointerInfo::getFixedStack(SSFI), 6346 false, false, 0); 6347 } 6348 6349 return Result; 6350} 6351 6352// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 6353SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 6354 SelectionDAG &DAG) const { 6355 // This algorithm is not obvious. Here it is in C code, more or less: 6356 /* 6357 double uint64_to_double( uint32_t hi, uint32_t lo ) { 6358 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 6359 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 6360 6361 // Copy ints to xmm registers. 6362 __m128i xh = _mm_cvtsi32_si128( hi ); 6363 __m128i xl = _mm_cvtsi32_si128( lo ); 6364 6365 // Combine into low half of a single xmm register. 
6366 __m128i x = _mm_unpacklo_epi32( xh, xl ); 6367 __m128d d; 6368 double sd; 6369 6370 // Merge in appropriate exponents to give the integer bits the right 6371 // magnitude. 6372 x = _mm_unpacklo_epi32( x, exp ); 6373 6374 // Subtract away the biases to deal with the IEEE-754 double precision 6375 // implicit 1. 6376 d = _mm_sub_pd( (__m128d) x, bias ); 6377 6378 // All conversions up to here are exact. The correctly rounded result is 6379 // calculated using the current rounding mode using the following 6380 // horizontal add. 6381 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 6382 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 6383 // store doesn't really need to be here (except 6384 // maybe to zero the other double) 6385 return sd; 6386 } 6387 */ 6388 6389 DebugLoc dl = Op.getDebugLoc(); 6390 LLVMContext *Context = DAG.getContext(); 6391 6392 // Build some magic constants. 6393 std::vector<Constant*> CV0; 6394 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 6395 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 6396 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6397 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6398 Constant *C0 = ConstantVector::get(CV0); 6399 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 6400 6401 std::vector<Constant*> CV1; 6402 CV1.push_back( 6403 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 6404 CV1.push_back( 6405 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 6406 Constant *C1 = ConstantVector::get(CV1); 6407 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 6408 6409 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6410 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6411 Op.getOperand(0), 6412 DAG.getIntPtrConstant(1))); 6413 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6414 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6415 Op.getOperand(0), 6416 DAG.getIntPtrConstant(0))); 6417 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 6418 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 6419 MachinePointerInfo::getConstantPool(), 6420 false, false, 16); 6421 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 6422 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); 6423 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 6424 MachinePointerInfo::getConstantPool(), 6425 false, false, 16); 6426 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 6427 6428 // Add the halves; easiest way is to swap them into another reg first. 6429 int ShufMask[2] = { 1, -1 }; 6430 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 6431 DAG.getUNDEF(MVT::v2f64), ShufMask); 6432 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 6433 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 6434 DAG.getIntPtrConstant(0)); 6435} 6436 6437// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 6438SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 6439 SelectionDAG &DAG) const { 6440 DebugLoc dl = Op.getDebugLoc(); 6441 // FP constant to bias correct the final result. 6442 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 6443 MVT::f64); 6444 6445 // Load the 32-bit value into an XMM register. 
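  // Added sketch of the exponent-bias trick used below: OR-ing the 32-bit
  // value x into the low mantissa bits of the constant 2^52
  // (0x4330000000000000) produces exactly the double 2^52 + x, since x fits
  // entirely in the 52-bit mantissa.  Subtracting the bias then recovers x
  // exactly; only the final FP_ROUND (for an f32 result) can round.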
6446 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6447 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6448 Op.getOperand(0), 6449 DAG.getIntPtrConstant(0))); 6450 6451 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6452 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 6453 DAG.getIntPtrConstant(0)); 6454 6455 // Or the load with the bias. 6456 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 6457 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 6458 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6459 MVT::v2f64, Load)), 6460 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 6461 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6462 MVT::v2f64, Bias))); 6463 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6464 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 6465 DAG.getIntPtrConstant(0)); 6466 6467 // Subtract the bias. 6468 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 6469 6470 // Handle final rounding. 6471 EVT DestVT = Op.getValueType(); 6472 6473 if (DestVT.bitsLT(MVT::f64)) { 6474 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 6475 DAG.getIntPtrConstant(0)); 6476 } else if (DestVT.bitsGT(MVT::f64)) { 6477 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 6478 } 6479 6480 // Handle final rounding. 6481 return Sub; 6482} 6483 6484SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 6485 SelectionDAG &DAG) const { 6486 SDValue N0 = Op.getOperand(0); 6487 DebugLoc dl = Op.getDebugLoc(); 6488 6489 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 6490 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 6491 // the optimization here. 6492 if (DAG.SignBitIsZero(N0)) 6493 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 6494 6495 EVT SrcVT = N0.getValueType(); 6496 EVT DstVT = Op.getValueType(); 6497 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 6498 return LowerUINT_TO_FP_i64(Op, DAG); 6499 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 6500 return LowerUINT_TO_FP_i32(Op, DAG); 6501 6502 // Make a 64-bit buffer, and use it to build an FILD. 6503 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 6504 if (SrcVT == MVT::i32) { 6505 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 6506 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 6507 getPointerTy(), StackSlot, WordOff); 6508 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6509 StackSlot, MachinePointerInfo(), 6510 false, false, 0); 6511 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 6512 OffsetSlot, MachinePointerInfo(), 6513 false, false, 0); 6514 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 6515 return Fild; 6516 } 6517 6518 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 6519 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6520 StackSlot, MachinePointerInfo(), 6521 false, false, 0); 6522 // For i64 source, we need to add the appropriate power of 2 if the input 6523 // was negative. This is the same as the optimization in 6524 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 6525 // we must be careful to do the computation in x87 extended precision, not 6526 // in SSE. (The generic code can't know it's OK to do this, or how to.) 
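  // Added note: FILD always interprets the stored 64 bits as signed, so an
  // input with the sign bit set is read back as (x - 2^64).  The code below
  // therefore conditionally adds back 2^64 (the f32 constant 0x5F800000,
  // extended to f80), selecting between that fudge value and +0.0 based on
  // the sign of the original input.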
6527 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6528 MachineMemOperand *MMO = 6529 DAG.getMachineFunction() 6530 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6531 MachineMemOperand::MOLoad, 8, 8); 6532 6533 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 6534 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 6535 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 6536 MVT::i64, MMO); 6537 6538 APInt FF(32, 0x5F800000ULL); 6539 6540 // Check whether the sign bit is set. 6541 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 6542 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 6543 ISD::SETLT); 6544 6545 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 6546 SDValue FudgePtr = DAG.getConstantPool( 6547 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 6548 getPointerTy()); 6549 6550 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 6551 SDValue Zero = DAG.getIntPtrConstant(0); 6552 SDValue Four = DAG.getIntPtrConstant(4); 6553 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 6554 Zero, Four); 6555 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 6556 6557 // Load the value out, extending it from f32 to f80. 6558 // FIXME: Avoid the extend by constructing the right constant pool? 6559 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 6560 FudgePtr, MachinePointerInfo::getConstantPool(), 6561 MVT::f32, false, false, 4); 6562 // Extend everything to 80 bits to force it to be done on x87. 6563 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 6564 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 6565} 6566 6567std::pair<SDValue,SDValue> X86TargetLowering:: 6568FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 6569 DebugLoc DL = Op.getDebugLoc(); 6570 6571 EVT DstTy = Op.getValueType(); 6572 6573 if (!IsSigned) { 6574 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 6575 DstTy = MVT::i64; 6576 } 6577 6578 assert(DstTy.getSimpleVT() <= MVT::i64 && 6579 DstTy.getSimpleVT() >= MVT::i16 && 6580 "Unknown FP_TO_SINT to lower!"); 6581 6582 // These are really Legal. 6583 if (DstTy == MVT::i32 && 6584 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6585 return std::make_pair(SDValue(), SDValue()); 6586 if (Subtarget->is64Bit() && 6587 DstTy == MVT::i64 && 6588 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6589 return std::make_pair(SDValue(), SDValue()); 6590 6591 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 6592 // stack slot. 
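  // (Added note: FP_TO_UINT of i32 reuses this path because DstTy was
  // widened to i64 above; for inputs in [0, 2^32) the low 32 bits of the
  // FISTP64 result are the correct unsigned value, and that is what the
  // caller loads back from the slot.)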
6593 MachineFunction &MF = DAG.getMachineFunction(); 6594 unsigned MemSize = DstTy.getSizeInBits()/8; 6595 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6596 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6597 6598 6599 6600 unsigned Opc; 6601 switch (DstTy.getSimpleVT().SimpleTy) { 6602 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 6603 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 6604 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 6605 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 6606 } 6607 6608 SDValue Chain = DAG.getEntryNode(); 6609 SDValue Value = Op.getOperand(0); 6610 EVT TheVT = Op.getOperand(0).getValueType(); 6611 if (isScalarFPTypeInSSEReg(TheVT)) { 6612 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 6613 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 6614 MachinePointerInfo::getFixedStack(SSFI), 6615 false, false, 0); 6616 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 6617 SDValue Ops[] = { 6618 Chain, StackSlot, DAG.getValueType(TheVT) 6619 }; 6620 6621 MachineMemOperand *MMO = 6622 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6623 MachineMemOperand::MOLoad, MemSize, MemSize); 6624 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 6625 DstTy, MMO); 6626 Chain = Value.getValue(1); 6627 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6628 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6629 } 6630 6631 MachineMemOperand *MMO = 6632 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6633 MachineMemOperand::MOStore, MemSize, MemSize); 6634 6635 // Build the FP_TO_INT*_IN_MEM 6636 SDValue Ops[] = { Chain, Value, StackSlot }; 6637 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 6638 Ops, 3, DstTy, MMO); 6639 6640 return std::make_pair(FIST, StackSlot); 6641} 6642 6643SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 6644 SelectionDAG &DAG) const { 6645 if (Op.getValueType().isVector()) 6646 return SDValue(); 6647 6648 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 6649 SDValue FIST = Vals.first, StackSlot = Vals.second; 6650 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 6651 if (FIST.getNode() == 0) return Op; 6652 6653 // Load the result. 6654 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6655 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 6656} 6657 6658SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 6659 SelectionDAG &DAG) const { 6660 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 6661 SDValue FIST = Vals.first, StackSlot = Vals.second; 6662 assert(FIST.getNode() && "Unexpected failure"); 6663 6664 // Load the result. 
6665 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6666 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 6667} 6668 6669SDValue X86TargetLowering::LowerFABS(SDValue Op, 6670 SelectionDAG &DAG) const { 6671 LLVMContext *Context = DAG.getContext(); 6672 DebugLoc dl = Op.getDebugLoc(); 6673 EVT VT = Op.getValueType(); 6674 EVT EltVT = VT; 6675 if (VT.isVector()) 6676 EltVT = VT.getVectorElementType(); 6677 std::vector<Constant*> CV; 6678 if (EltVT == MVT::f64) { 6679 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 6680 CV.push_back(C); 6681 CV.push_back(C); 6682 } else { 6683 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 6684 CV.push_back(C); 6685 CV.push_back(C); 6686 CV.push_back(C); 6687 CV.push_back(C); 6688 } 6689 Constant *C = ConstantVector::get(CV); 6690 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6691 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6692 MachinePointerInfo::getConstantPool(), 6693 false, false, 16); 6694 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 6695} 6696 6697SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 6698 LLVMContext *Context = DAG.getContext(); 6699 DebugLoc dl = Op.getDebugLoc(); 6700 EVT VT = Op.getValueType(); 6701 EVT EltVT = VT; 6702 if (VT.isVector()) 6703 EltVT = VT.getVectorElementType(); 6704 std::vector<Constant*> CV; 6705 if (EltVT == MVT::f64) { 6706 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 6707 CV.push_back(C); 6708 CV.push_back(C); 6709 } else { 6710 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 6711 CV.push_back(C); 6712 CV.push_back(C); 6713 CV.push_back(C); 6714 CV.push_back(C); 6715 } 6716 Constant *C = ConstantVector::get(CV); 6717 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6718 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6719 MachinePointerInfo::getConstantPool(), 6720 false, false, 16); 6721 if (VT.isVector()) { 6722 return DAG.getNode(ISD::BITCAST, dl, VT, 6723 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6724 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 6725 Op.getOperand(0)), 6726 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); 6727 } else { 6728 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6729 } 6730} 6731 6732SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6733 LLVMContext *Context = DAG.getContext(); 6734 SDValue Op0 = Op.getOperand(0); 6735 SDValue Op1 = Op.getOperand(1); 6736 DebugLoc dl = Op.getDebugLoc(); 6737 EVT VT = Op.getValueType(); 6738 EVT SrcVT = Op1.getValueType(); 6739 6740 // If second operand is smaller, extend it first. 6741 if (SrcVT.bitsLT(VT)) { 6742 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6743 SrcVT = VT; 6744 } 6745 // And if it is bigger, shrink it first. 6746 if (SrcVT.bitsGT(VT)) { 6747 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6748 SrcVT = VT; 6749 } 6750 6751 // At this point the operands and the result should have the same 6752 // type, and that won't be f80 since that is not custom lowered. 6753 6754 // First get the sign bit of second operand. 
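  // In scalar terms the lowering below computes roughly (added sketch):
  //   sign = bits(Op1) &  SIGN_MASK   // 0x8000... in the source width
  //   mag  = bits(Op0) & ~SIGN_MASK   // in the result width
  //   res  = mag | sign
  // with the masks materialized from the constant pool and the bit
  // operations done as FAND/FOR on XMM registers.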
6755 std::vector<Constant*> CV; 6756 if (SrcVT == MVT::f64) { 6757 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6758 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6759 } else { 6760 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6761 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6762 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6763 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6764 } 6765 Constant *C = ConstantVector::get(CV); 6766 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6767 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6768 MachinePointerInfo::getConstantPool(), 6769 false, false, 16); 6770 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6771 6772 // Shift sign bit right or left if the two operands have different types. 6773 if (SrcVT.bitsGT(VT)) { 6774 // Op0 is MVT::f32, Op1 is MVT::f64. 6775 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6776 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6777 DAG.getConstant(32, MVT::i32)); 6778 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 6779 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6780 DAG.getIntPtrConstant(0)); 6781 } 6782 6783 // Clear first operand sign bit. 6784 CV.clear(); 6785 if (VT == MVT::f64) { 6786 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6787 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6788 } else { 6789 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6790 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6791 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6792 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6793 } 6794 C = ConstantVector::get(CV); 6795 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6796 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6797 MachinePointerInfo::getConstantPool(), 6798 false, false, 16); 6799 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6800 6801 // Or the value with the sign bit. 6802 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6803} 6804 6805/// Emit nodes that will be selected as "test Op0,Op0", or something 6806/// equivalent. 6807SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6808 SelectionDAG &DAG) const { 6809 DebugLoc dl = Op.getDebugLoc(); 6810 6811 // CF and OF aren't always set the way we want. Determine which 6812 // of these we need. 6813 bool NeedCF = false; 6814 bool NeedOF = false; 6815 switch (X86CC) { 6816 default: break; 6817 case X86::COND_A: case X86::COND_AE: 6818 case X86::COND_B: case X86::COND_BE: 6819 NeedCF = true; 6820 break; 6821 case X86::COND_G: case X86::COND_GE: 6822 case X86::COND_L: case X86::COND_LE: 6823 case X86::COND_O: case X86::COND_NO: 6824 NeedOF = true; 6825 break; 6826 } 6827 6828 // See if we can use the EFLAGS value from the operand instead of 6829 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 6830 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6831 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6832 // Emit a CMP with 0, which is the TEST pattern. 
6833 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6834 DAG.getConstant(0, Op.getValueType())); 6835 6836 unsigned Opcode = 0; 6837 unsigned NumOperands = 0; 6838 switch (Op.getNode()->getOpcode()) { 6839 case ISD::ADD: 6840 // Due to an isel shortcoming, be conservative if this add is likely to be 6841 // selected as part of a load-modify-store instruction. When the root node 6842 // in a match is a store, isel doesn't know how to remap non-chain non-flag 6843 // uses of other nodes in the match, such as the ADD in this case. This 6844 // leads to the ADD being left around and reselected, with the result being 6845 // two adds in the output. Alas, even if none our users are stores, that 6846 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 6847 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 6848 // climbing the DAG back to the root, and it doesn't seem to be worth the 6849 // effort. 6850 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6851 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6852 if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) 6853 goto default_case; 6854 6855 if (ConstantSDNode *C = 6856 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 6857 // An add of one will be selected as an INC. 6858 if (C->getAPIntValue() == 1) { 6859 Opcode = X86ISD::INC; 6860 NumOperands = 1; 6861 break; 6862 } 6863 6864 // An add of negative one (subtract of one) will be selected as a DEC. 6865 if (C->getAPIntValue().isAllOnesValue()) { 6866 Opcode = X86ISD::DEC; 6867 NumOperands = 1; 6868 break; 6869 } 6870 } 6871 6872 // Otherwise use a regular EFLAGS-setting add. 6873 Opcode = X86ISD::ADD; 6874 NumOperands = 2; 6875 break; 6876 case ISD::AND: { 6877 // If the primary and result isn't used, don't bother using X86ISD::AND, 6878 // because a TEST instruction will be better. 6879 bool NonFlagUse = false; 6880 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6881 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 6882 SDNode *User = *UI; 6883 unsigned UOpNo = UI.getOperandNo(); 6884 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 6885 // Look pass truncate. 6886 UOpNo = User->use_begin().getOperandNo(); 6887 User = *User->use_begin(); 6888 } 6889 6890 if (User->getOpcode() != ISD::BRCOND && 6891 User->getOpcode() != ISD::SETCC && 6892 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 6893 NonFlagUse = true; 6894 break; 6895 } 6896 } 6897 6898 if (!NonFlagUse) 6899 break; 6900 } 6901 // FALL THROUGH 6902 case ISD::SUB: 6903 case ISD::OR: 6904 case ISD::XOR: 6905 // Due to the ISEL shortcoming noted above, be conservative if this op is 6906 // likely to be selected as part of a load-modify-store instruction. 6907 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6908 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6909 if (UI->getOpcode() == ISD::STORE) 6910 goto default_case; 6911 6912 // Otherwise use a regular EFLAGS-setting instruction. 
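  // (Added note: the operation is re-emitted below as its EFLAGS-producing
  // X86ISD counterpart; all uses of the original value are then redirected
  // to the new node so its flags result can be consumed without a separate
  // TEST.)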
6913 switch (Op.getNode()->getOpcode()) { 6914 default: llvm_unreachable("unexpected operator!"); 6915 case ISD::SUB: Opcode = X86ISD::SUB; break; 6916 case ISD::OR: Opcode = X86ISD::OR; break; 6917 case ISD::XOR: Opcode = X86ISD::XOR; break; 6918 case ISD::AND: Opcode = X86ISD::AND; break; 6919 } 6920 6921 NumOperands = 2; 6922 break; 6923 case X86ISD::ADD: 6924 case X86ISD::SUB: 6925 case X86ISD::INC: 6926 case X86ISD::DEC: 6927 case X86ISD::OR: 6928 case X86ISD::XOR: 6929 case X86ISD::AND: 6930 return SDValue(Op.getNode(), 1); 6931 default: 6932 default_case: 6933 break; 6934 } 6935 6936 if (Opcode == 0) 6937 // Emit a CMP with 0, which is the TEST pattern. 6938 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6939 DAG.getConstant(0, Op.getValueType())); 6940 6941 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6942 SmallVector<SDValue, 4> Ops; 6943 for (unsigned i = 0; i != NumOperands; ++i) 6944 Ops.push_back(Op.getOperand(i)); 6945 6946 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6947 DAG.ReplaceAllUsesWith(Op, New); 6948 return SDValue(New.getNode(), 1); 6949} 6950 6951/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6952/// equivalent. 6953SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6954 SelectionDAG &DAG) const { 6955 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6956 if (C->getAPIntValue() == 0) 6957 return EmitTest(Op0, X86CC, DAG); 6958 6959 DebugLoc dl = Op0.getDebugLoc(); 6960 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6961} 6962 6963/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6964/// if it's possible. 6965SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6966 DebugLoc dl, SelectionDAG &DAG) const { 6967 SDValue Op0 = And.getOperand(0); 6968 SDValue Op1 = And.getOperand(1); 6969 if (Op0.getOpcode() == ISD::TRUNCATE) 6970 Op0 = Op0.getOperand(0); 6971 if (Op1.getOpcode() == ISD::TRUNCATE) 6972 Op1 = Op1.getOperand(0); 6973 6974 SDValue LHS, RHS; 6975 if (Op1.getOpcode() == ISD::SHL) 6976 std::swap(Op0, Op1); 6977 if (Op0.getOpcode() == ISD::SHL) { 6978 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6979 if (And00C->getZExtValue() == 1) { 6980 // If we looked past a truncate, check that it's only truncating away 6981 // known zeros. 6982 unsigned BitWidth = Op0.getValueSizeInBits(); 6983 unsigned AndBitWidth = And.getValueSizeInBits(); 6984 if (BitWidth > AndBitWidth) { 6985 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 6986 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 6987 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 6988 return SDValue(); 6989 } 6990 LHS = Op1; 6991 RHS = Op0.getOperand(1); 6992 } 6993 } else if (Op1.getOpcode() == ISD::Constant) { 6994 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6995 SDValue AndLHS = Op0; 6996 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6997 LHS = AndLHS.getOperand(0); 6998 RHS = AndLHS.getOperand(1); 6999 } 7000 } 7001 7002 if (LHS.getNode()) { 7003 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 7004 // instruction. Since the shift amount is in-range-or-undefined, we know 7005 // that doing a bittest on the i32 value is ok. We extend to i32 because 7006 // the encoding for the i16 version is larger than the i32 version. 7007 // Also promote i16 to i32 for performance / code size reason. 
7008 if (LHS.getValueType() == MVT::i8 || 7009 LHS.getValueType() == MVT::i16) 7010 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 7011 7012 // If the operand types disagree, extend the shift amount to match. Since 7013 // BT ignores high bits (like shifts) we can use anyextend. 7014 if (LHS.getValueType() != RHS.getValueType()) 7015 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 7016 7017 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 7018 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 7019 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7020 DAG.getConstant(Cond, MVT::i8), BT); 7021 } 7022 7023 return SDValue(); 7024} 7025 7026SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 7027 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 7028 SDValue Op0 = Op.getOperand(0); 7029 SDValue Op1 = Op.getOperand(1); 7030 DebugLoc dl = Op.getDebugLoc(); 7031 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7032 7033 // Optimize to BT if possible. 7034 // Lower (X & (1 << N)) == 0 to BT(X, N). 7035 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 7036 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 7037 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 7038 Op1.getOpcode() == ISD::Constant && 7039 cast<ConstantSDNode>(Op1)->isNullValue() && 7040 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7041 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 7042 if (NewSetCC.getNode()) 7043 return NewSetCC; 7044 } 7045 7046 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 7047 // these. 7048 if (Op1.getOpcode() == ISD::Constant && 7049 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 7050 cast<ConstantSDNode>(Op1)->isNullValue()) && 7051 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7052 7053 // If the input is a setcc, then reuse the input setcc or use a new one with 7054 // the inverted condition. 7055 if (Op0.getOpcode() == X86ISD::SETCC) { 7056 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 7057 bool Invert = (CC == ISD::SETNE) ^ 7058 cast<ConstantSDNode>(Op1)->isNullValue(); 7059 if (!Invert) return Op0; 7060 7061 CCode = X86::GetOppositeBranchCondition(CCode); 7062 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7063 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 7064 } 7065 } 7066 7067 bool isFP = Op1.getValueType().isFloatingPoint(); 7068 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 7069 if (X86CC == X86::COND_INVALID) 7070 return SDValue(); 7071 7072 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 7073 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7074 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 7075} 7076 7077SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 7078 SDValue Cond; 7079 SDValue Op0 = Op.getOperand(0); 7080 SDValue Op1 = Op.getOperand(1); 7081 SDValue CC = Op.getOperand(2); 7082 EVT VT = Op.getValueType(); 7083 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 7084 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 7085 DebugLoc dl = Op.getDebugLoc(); 7086 7087 if (isFP) { 7088 unsigned SSECC = 8; 7089 EVT VT0 = Op0.getValueType(); 7090 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 7091 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 7092 bool Swap = false; 7093 7094 switch (SetCCOpcode) { 7095 default: break; 7096 case ISD::SETOEQ: 7097 case ISD::SETEQ: SSECC = 0; break; 7098 case ISD::SETOGT: 7099 case ISD::SETGT: Swap = true; // Fallthrough 7100 case ISD::SETLT: 7101 case ISD::SETOLT: SSECC = 1; break; 7102 case ISD::SETOGE: 7103 case ISD::SETGE: Swap = true; // Fallthrough 7104 case ISD::SETLE: 7105 case ISD::SETOLE: SSECC = 2; break; 7106 case ISD::SETUO: SSECC = 3; break; 7107 case ISD::SETUNE: 7108 case ISD::SETNE: SSECC = 4; break; 7109 case ISD::SETULE: Swap = true; 7110 case ISD::SETUGE: SSECC = 5; break; 7111 case ISD::SETULT: Swap = true; 7112 case ISD::SETUGT: SSECC = 6; break; 7113 case ISD::SETO: SSECC = 7; break; 7114 } 7115 if (Swap) 7116 std::swap(Op0, Op1); 7117 7118 // In the two special cases we can't handle, emit two comparisons. 7119 if (SSECC == 8) { 7120 if (SetCCOpcode == ISD::SETUEQ) { 7121 SDValue UNORD, EQ; 7122 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 7123 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 7124 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 7125 } 7126 else if (SetCCOpcode == ISD::SETONE) { 7127 SDValue ORD, NEQ; 7128 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 7129 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 7130 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 7131 } 7132 llvm_unreachable("Illegal FP comparison"); 7133 } 7134 // Handle all other FP comparisons here. 7135 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 7136 } 7137 7138 // We are handling one of the integer comparisons here. Since SSE only has 7139 // GT and EQ comparisons for integer, swapping operands and multiple 7140 // operations may be required for some comparisons. 7141 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 7142 bool Swap = false, Invert = false, FlipSigns = false; 7143 7144 switch (VT.getSimpleVT().SimpleTy) { 7145 default: break; 7146 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 7147 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 7148 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 7149 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 7150 } 7151 7152 switch (SetCCOpcode) { 7153 default: break; 7154 case ISD::SETNE: Invert = true; 7155 case ISD::SETEQ: Opc = EQOpc; break; 7156 case ISD::SETLT: Swap = true; 7157 case ISD::SETGT: Opc = GTOpc; break; 7158 case ISD::SETGE: Swap = true; 7159 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 7160 case ISD::SETULT: Swap = true; 7161 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 7162 case ISD::SETUGE: Swap = true; 7163 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 7164 } 7165 if (Swap) 7166 std::swap(Op0, Op1); 7167 7168 // Since SSE has no unsigned integer comparisons, we need to flip the sign 7169 // bits of the inputs before performing those operations. 
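  // For example, an unsigned v4i32 compare XORs both operands with
  // 0x80000000 per element: a >u b holds exactly when
  // (a ^ 0x80000000) >s (b ^ 0x80000000).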
7170 if (FlipSigns) { 7171 EVT EltVT = VT.getVectorElementType(); 7172 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 7173 EltVT); 7174 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 7175 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 7176 SignBits.size()); 7177 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 7178 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 7179 } 7180 7181 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 7182 7183 // If the logical-not of the result is required, perform that now. 7184 if (Invert) 7185 Result = DAG.getNOT(dl, Result, VT); 7186 7187 return Result; 7188} 7189 7190// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 7191static bool isX86LogicalCmp(SDValue Op) { 7192 unsigned Opc = Op.getNode()->getOpcode(); 7193 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 7194 return true; 7195 if (Op.getResNo() == 1 && 7196 (Opc == X86ISD::ADD || 7197 Opc == X86ISD::SUB || 7198 Opc == X86ISD::ADC || 7199 Opc == X86ISD::SBB || 7200 Opc == X86ISD::SMUL || 7201 Opc == X86ISD::UMUL || 7202 Opc == X86ISD::INC || 7203 Opc == X86ISD::DEC || 7204 Opc == X86ISD::OR || 7205 Opc == X86ISD::XOR || 7206 Opc == X86ISD::AND)) 7207 return true; 7208 7209 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 7210 return true; 7211 7212 return false; 7213} 7214 7215static bool isZero(SDValue V) { 7216 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 7217 return C && C->isNullValue(); 7218} 7219 7220static bool isAllOnes(SDValue V) { 7221 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 7222 return C && C->isAllOnesValue(); 7223} 7224 7225SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 7226 bool addTest = true; 7227 SDValue Cond = Op.getOperand(0); 7228 SDValue Op1 = Op.getOperand(1); 7229 SDValue Op2 = Op.getOperand(2); 7230 DebugLoc DL = Op.getDebugLoc(); 7231 SDValue CC; 7232 7233 if (Cond.getOpcode() == ISD::SETCC) { 7234 SDValue NewCond = LowerSETCC(Cond, DAG); 7235 if (NewCond.getNode()) 7236 Cond = NewCond; 7237 } 7238 7239 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 7240 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 7241 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 7242 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 7243 if (Cond.getOpcode() == X86ISD::SETCC && 7244 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 7245 isZero(Cond.getOperand(1).getOperand(1))) { 7246 SDValue Cmp = Cond.getOperand(1); 7247 7248 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 7249 7250 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 7251 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 7252 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 7253 7254 SDValue CmpOp0 = Cmp.getOperand(0); 7255 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 7256 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 7257 7258 SDValue Res = // Res = 0 or -1. 7259 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 7260 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 7261 7262 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 7263 Res = DAG.getNOT(DL, Res, Res.getValueType()); 7264 7265 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 7266 if (N2C == 0 || !N2C->isNullValue()) 7267 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 7268 return Res; 7269 } 7270 } 7271 7272 // Look past (and (setcc_carry (cmp ...)), 1). 
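  // (Added note: SETCC_CARRY materializes 0 or all-ones; the AND with 1 only
  // normalizes that to 0/1, so for selection purposes the underlying node
  // can be used directly.)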
7273 if (Cond.getOpcode() == ISD::AND && 7274 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 7275 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 7276 if (C && C->getAPIntValue() == 1) 7277 Cond = Cond.getOperand(0); 7278 } 7279 7280 // If condition flag is set by a X86ISD::CMP, then use it as the condition 7281 // setting operand in place of the X86ISD::SETCC. 7282 if (Cond.getOpcode() == X86ISD::SETCC || 7283 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7284 CC = Cond.getOperand(0); 7285 7286 SDValue Cmp = Cond.getOperand(1); 7287 unsigned Opc = Cmp.getOpcode(); 7288 EVT VT = Op.getValueType(); 7289 7290 bool IllegalFPCMov = false; 7291 if (VT.isFloatingPoint() && !VT.isVector() && 7292 !isScalarFPTypeInSSEReg(VT)) // FPStack? 7293 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 7294 7295 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 7296 Opc == X86ISD::BT) { // FIXME 7297 Cond = Cmp; 7298 addTest = false; 7299 } 7300 } 7301 7302 if (addTest) { 7303 // Look pass the truncate. 7304 if (Cond.getOpcode() == ISD::TRUNCATE) 7305 Cond = Cond.getOperand(0); 7306 7307 // We know the result of AND is compared against zero. Try to match 7308 // it to BT. 7309 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 7310 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 7311 if (NewSetCC.getNode()) { 7312 CC = NewSetCC.getOperand(0); 7313 Cond = NewSetCC.getOperand(1); 7314 addTest = false; 7315 } 7316 } 7317 } 7318 7319 if (addTest) { 7320 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7321 Cond = EmitTest(Cond, X86::COND_NE, DAG); 7322 } 7323 7324 // a < b ? -1 : 0 -> RES = ~setcc_carry 7325 // a < b ? 0 : -1 -> RES = setcc_carry 7326 // a >= b ? -1 : 0 -> RES = setcc_carry 7327 // a >= b ? 0 : -1 -> RES = ~setcc_carry 7328 if (Cond.getOpcode() == X86ISD::CMP) { 7329 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 7330 7331 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 7332 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 7333 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 7334 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 7335 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 7336 return DAG.getNOT(DL, Res, Res.getValueType()); 7337 return Res; 7338 } 7339 } 7340 7341 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 7342 // condition is true. 7343 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 7344 SDValue Ops[] = { Op2, Op1, CC, Cond }; 7345 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 7346} 7347 7348// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 7349// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 7350// from the AND / OR. 7351static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 7352 Opc = Op.getOpcode(); 7353 if (Opc != ISD::OR && Opc != ISD::AND) 7354 return false; 7355 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 7356 Op.getOperand(0).hasOneUse() && 7357 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 7358 Op.getOperand(1).hasOneUse()); 7359} 7360 7361// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 7362// 1 and that the SETCC node has a single use. 
7363static bool isXor1OfSetCC(SDValue Op) { 7364 if (Op.getOpcode() != ISD::XOR) 7365 return false; 7366 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 7367 if (N1C && N1C->getAPIntValue() == 1) { 7368 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 7369 Op.getOperand(0).hasOneUse(); 7370 } 7371 return false; 7372} 7373 7374SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 7375 bool addTest = true; 7376 SDValue Chain = Op.getOperand(0); 7377 SDValue Cond = Op.getOperand(1); 7378 SDValue Dest = Op.getOperand(2); 7379 DebugLoc dl = Op.getDebugLoc(); 7380 SDValue CC; 7381 7382 if (Cond.getOpcode() == ISD::SETCC) { 7383 SDValue NewCond = LowerSETCC(Cond, DAG); 7384 if (NewCond.getNode()) 7385 Cond = NewCond; 7386 } 7387#if 0 7388 // FIXME: LowerXALUO doesn't handle these!! 7389 else if (Cond.getOpcode() == X86ISD::ADD || 7390 Cond.getOpcode() == X86ISD::SUB || 7391 Cond.getOpcode() == X86ISD::SMUL || 7392 Cond.getOpcode() == X86ISD::UMUL) 7393 Cond = LowerXALUO(Cond, DAG); 7394#endif 7395 7396 // Look pass (and (setcc_carry (cmp ...)), 1). 7397 if (Cond.getOpcode() == ISD::AND && 7398 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 7399 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 7400 if (C && C->getAPIntValue() == 1) 7401 Cond = Cond.getOperand(0); 7402 } 7403 7404 // If condition flag is set by a X86ISD::CMP, then use it as the condition 7405 // setting operand in place of the X86ISD::SETCC. 7406 if (Cond.getOpcode() == X86ISD::SETCC || 7407 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7408 CC = Cond.getOperand(0); 7409 7410 SDValue Cmp = Cond.getOperand(1); 7411 unsigned Opc = Cmp.getOpcode(); 7412 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 7413 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 7414 Cond = Cmp; 7415 addTest = false; 7416 } else { 7417 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 7418 default: break; 7419 case X86::COND_O: 7420 case X86::COND_B: 7421 // These can only come from an arithmetic instruction with overflow, 7422 // e.g. SADDO, UADDO. 7423 Cond = Cond.getNode()->getOperand(1); 7424 addTest = false; 7425 break; 7426 } 7427 } 7428 } else { 7429 unsigned CondOpc; 7430 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 7431 SDValue Cmp = Cond.getOperand(0).getOperand(1); 7432 if (CondOpc == ISD::OR) { 7433 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 7434 // two branches instead of an explicit OR instruction with a 7435 // separate test. 7436 if (Cmp == Cond.getOperand(1).getOperand(1) && 7437 isX86LogicalCmp(Cmp)) { 7438 CC = Cond.getOperand(0).getOperand(0); 7439 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7440 Chain, Dest, CC, Cmp); 7441 CC = Cond.getOperand(1).getOperand(0); 7442 Cond = Cmp; 7443 addTest = false; 7444 } 7445 } else { // ISD::AND 7446 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 7447 // two branches instead of an explicit AND instruction with a 7448 // separate test. However, we only do this if this block doesn't 7449 // have a fall-through edge, because this requires an explicit 7450 // jmp when the condition is false. 
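        // Added sketch: roughly, "br (a oeq b), True, False" becomes
        //   jne False
        //   jp  False
        //   jmp True
        // i.e. both reversed conditions branch to the false block and the
        // trailing unconditional branch is retargeted to the true block.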
7451 if (Cmp == Cond.getOperand(1).getOperand(1) && 7452 isX86LogicalCmp(Cmp) && 7453 Op.getNode()->hasOneUse()) { 7454 X86::CondCode CCode = 7455 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7456 CCode = X86::GetOppositeBranchCondition(CCode); 7457 CC = DAG.getConstant(CCode, MVT::i8); 7458 SDNode *User = *Op.getNode()->use_begin(); 7459 // Look for an unconditional branch following this conditional branch. 7460 // We need this because we need to reverse the successors in order 7461 // to implement FCMP_OEQ. 7462 if (User->getOpcode() == ISD::BR) { 7463 SDValue FalseBB = User->getOperand(1); 7464 SDNode *NewBR = 7465 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 7466 assert(NewBR == User); 7467 (void)NewBR; 7468 Dest = FalseBB; 7469 7470 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7471 Chain, Dest, CC, Cmp); 7472 X86::CondCode CCode = 7473 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 7474 CCode = X86::GetOppositeBranchCondition(CCode); 7475 CC = DAG.getConstant(CCode, MVT::i8); 7476 Cond = Cmp; 7477 addTest = false; 7478 } 7479 } 7480 } 7481 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 7482 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 7483 // It should be transformed during dag combiner except when the condition 7484 // is set by a arithmetics with overflow node. 7485 X86::CondCode CCode = 7486 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7487 CCode = X86::GetOppositeBranchCondition(CCode); 7488 CC = DAG.getConstant(CCode, MVT::i8); 7489 Cond = Cond.getOperand(0).getOperand(1); 7490 addTest = false; 7491 } 7492 } 7493 7494 if (addTest) { 7495 // Look pass the truncate. 7496 if (Cond.getOpcode() == ISD::TRUNCATE) 7497 Cond = Cond.getOperand(0); 7498 7499 // We know the result of AND is compared against zero. Try to match 7500 // it to BT. 7501 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 7502 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 7503 if (NewSetCC.getNode()) { 7504 CC = NewSetCC.getOperand(0); 7505 Cond = NewSetCC.getOperand(1); 7506 addTest = false; 7507 } 7508 } 7509 } 7510 7511 if (addTest) { 7512 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7513 Cond = EmitTest(Cond, X86::COND_NE, DAG); 7514 } 7515 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7516 Chain, Dest, CC, Cond); 7517} 7518 7519 7520// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 7521// Calls to _alloca is needed to probe the stack when allocating more than 4k 7522// bytes in one go. Touching the stack at 4K increments is necessary to ensure 7523// that the guard pages used by the OS virtual memory manager are allocated in 7524// correct sequence. 7525SDValue 7526X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 7527 SelectionDAG &DAG) const { 7528 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && 7529 "This should be used only on Windows targets"); 7530 DebugLoc dl = Op.getDebugLoc(); 7531 7532 // Get the inputs. 7533 SDValue Chain = Op.getOperand(0); 7534 SDValue Size = Op.getOperand(1); 7535 // FIXME: Ensure alignment here 7536 7537 SDValue Flag; 7538 7539 EVT SPTy = Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32; 7540 7541 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 7542 Flag = Chain.getValue(1); 7543 7544 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7545 7546 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 7547 Flag = Chain.getValue(1); 7548 7549 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 7550 7551 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 7552 return DAG.getMergeValues(Ops1, 2, dl); 7553} 7554 7555SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 7556 MachineFunction &MF = DAG.getMachineFunction(); 7557 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 7558 7559 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7560 DebugLoc DL = Op.getDebugLoc(); 7561 7562 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 7563 // vastart just stores the address of the VarArgsFrameIndex slot into the 7564 // memory location argument. 7565 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7566 getPointerTy()); 7567 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 7568 MachinePointerInfo(SV), false, false, 0); 7569 } 7570 7571 // __va_list_tag: 7572 // gp_offset (0 - 6 * 8) 7573 // fp_offset (48 - 48 + 8 * 16) 7574 // overflow_arg_area (point to parameters coming in memory). 7575 // reg_save_area 7576 SmallVector<SDValue, 8> MemOps; 7577 SDValue FIN = Op.getOperand(1); 7578 // Store gp_offset 7579 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 7580 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 7581 MVT::i32), 7582 FIN, MachinePointerInfo(SV), false, false, 0); 7583 MemOps.push_back(Store); 7584 7585 // Store fp_offset 7586 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7587 FIN, DAG.getIntPtrConstant(4)); 7588 Store = DAG.getStore(Op.getOperand(0), DL, 7589 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 7590 MVT::i32), 7591 FIN, MachinePointerInfo(SV, 4), false, false, 0); 7592 MemOps.push_back(Store); 7593 7594 // Store ptr to overflow_arg_area 7595 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7596 FIN, DAG.getIntPtrConstant(4)); 7597 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7598 getPointerTy()); 7599 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 7600 MachinePointerInfo(SV, 8), 7601 false, false, 0); 7602 MemOps.push_back(Store); 7603 7604 // Store ptr to reg_save_area. 
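  // (reg_save_area lives at offset 16 in the __va_list_tag, after the two
  // i32 offset fields and the overflow_arg_area pointer.)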
7605 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7606 FIN, DAG.getIntPtrConstant(8)); 7607 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 7608 getPointerTy()); 7609 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 7610 MachinePointerInfo(SV, 16), false, false, 0); 7611 MemOps.push_back(Store); 7612 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 7613 &MemOps[0], MemOps.size()); 7614} 7615 7616SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 7617 assert(Subtarget->is64Bit() && 7618 "LowerVAARG only handles 64-bit va_arg!"); 7619 assert((Subtarget->isTargetLinux() || 7620 Subtarget->isTargetDarwin()) && 7621 "Unhandled target in LowerVAARG"); 7622 assert(Op.getNode()->getNumOperands() == 4); 7623 SDValue Chain = Op.getOperand(0); 7624 SDValue SrcPtr = Op.getOperand(1); 7625 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7626 unsigned Align = Op.getConstantOperandVal(3); 7627 DebugLoc dl = Op.getDebugLoc(); 7628 7629 EVT ArgVT = Op.getNode()->getValueType(0); 7630 const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 7631 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 7632 uint8_t ArgMode; 7633 7634 // Decide which area this value should be read from. 7635 // TODO: Implement the AMD64 ABI in its entirety. This simple 7636 // selection mechanism works only for the basic types. 7637 if (ArgVT == MVT::f80) { 7638 llvm_unreachable("va_arg for f80 not yet implemented"); 7639 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 7640 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 7641 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 7642 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 7643 } else { 7644 llvm_unreachable("Unhandled argument type in LowerVAARG"); 7645 } 7646 7647 if (ArgMode == 2) { 7648 // Sanity Check: Make sure using fp_offset makes sense. 7649 assert(!UseSoftFloat && 7650 !(DAG.getMachineFunction() 7651 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 7652 Subtarget->hasXMM()); 7653 } 7654 7655 // Insert VAARG_64 node into the DAG 7656 // VAARG_64 returns two values: Variable Argument Address, Chain 7657 SmallVector<SDValue, 11> InstOps; 7658 InstOps.push_back(Chain); 7659 InstOps.push_back(SrcPtr); 7660 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 7661 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 7662 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 7663 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 7664 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 7665 VTs, &InstOps[0], InstOps.size(), 7666 MVT::i64, 7667 MachinePointerInfo(SV), 7668 /*Align=*/0, 7669 /*Volatile=*/false, 7670 /*ReadMem=*/true, 7671 /*WriteMem=*/true); 7672 Chain = VAARG.getValue(1); 7673 7674 // Load the next argument and return it 7675 return DAG.getLoad(ArgVT, dl, 7676 Chain, 7677 VAARG, 7678 MachinePointerInfo(), 7679 false, false, 0); 7680} 7681 7682SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 7683 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
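// Its four fields occupy 4 + 4 + 8 + 8 = 24 bytes with 8-byte alignment,
// which is why va_copy can be lowered to the fixed-size memcpy below.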
7684 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 7685 SDValue Chain = Op.getOperand(0); 7686 SDValue DstPtr = Op.getOperand(1); 7687 SDValue SrcPtr = Op.getOperand(2); 7688 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 7689 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7690 DebugLoc DL = Op.getDebugLoc(); 7691 7692 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 7693 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 7694 false, 7695 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 7696} 7697 7698SDValue 7699X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 7700 DebugLoc dl = Op.getDebugLoc(); 7701 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7702 switch (IntNo) { 7703 default: return SDValue(); // Don't custom lower most intrinsics. 7704 // Comparison intrinsics. 7705 case Intrinsic::x86_sse_comieq_ss: 7706 case Intrinsic::x86_sse_comilt_ss: 7707 case Intrinsic::x86_sse_comile_ss: 7708 case Intrinsic::x86_sse_comigt_ss: 7709 case Intrinsic::x86_sse_comige_ss: 7710 case Intrinsic::x86_sse_comineq_ss: 7711 case Intrinsic::x86_sse_ucomieq_ss: 7712 case Intrinsic::x86_sse_ucomilt_ss: 7713 case Intrinsic::x86_sse_ucomile_ss: 7714 case Intrinsic::x86_sse_ucomigt_ss: 7715 case Intrinsic::x86_sse_ucomige_ss: 7716 case Intrinsic::x86_sse_ucomineq_ss: 7717 case Intrinsic::x86_sse2_comieq_sd: 7718 case Intrinsic::x86_sse2_comilt_sd: 7719 case Intrinsic::x86_sse2_comile_sd: 7720 case Intrinsic::x86_sse2_comigt_sd: 7721 case Intrinsic::x86_sse2_comige_sd: 7722 case Intrinsic::x86_sse2_comineq_sd: 7723 case Intrinsic::x86_sse2_ucomieq_sd: 7724 case Intrinsic::x86_sse2_ucomilt_sd: 7725 case Intrinsic::x86_sse2_ucomile_sd: 7726 case Intrinsic::x86_sse2_ucomigt_sd: 7727 case Intrinsic::x86_sse2_ucomige_sd: 7728 case Intrinsic::x86_sse2_ucomineq_sd: { 7729 unsigned Opc = 0; 7730 ISD::CondCode CC = ISD::SETCC_INVALID; 7731 switch (IntNo) { 7732 default: break; 7733 case Intrinsic::x86_sse_comieq_ss: 7734 case Intrinsic::x86_sse2_comieq_sd: 7735 Opc = X86ISD::COMI; 7736 CC = ISD::SETEQ; 7737 break; 7738 case Intrinsic::x86_sse_comilt_ss: 7739 case Intrinsic::x86_sse2_comilt_sd: 7740 Opc = X86ISD::COMI; 7741 CC = ISD::SETLT; 7742 break; 7743 case Intrinsic::x86_sse_comile_ss: 7744 case Intrinsic::x86_sse2_comile_sd: 7745 Opc = X86ISD::COMI; 7746 CC = ISD::SETLE; 7747 break; 7748 case Intrinsic::x86_sse_comigt_ss: 7749 case Intrinsic::x86_sse2_comigt_sd: 7750 Opc = X86ISD::COMI; 7751 CC = ISD::SETGT; 7752 break; 7753 case Intrinsic::x86_sse_comige_ss: 7754 case Intrinsic::x86_sse2_comige_sd: 7755 Opc = X86ISD::COMI; 7756 CC = ISD::SETGE; 7757 break; 7758 case Intrinsic::x86_sse_comineq_ss: 7759 case Intrinsic::x86_sse2_comineq_sd: 7760 Opc = X86ISD::COMI; 7761 CC = ISD::SETNE; 7762 break; 7763 case Intrinsic::x86_sse_ucomieq_ss: 7764 case Intrinsic::x86_sse2_ucomieq_sd: 7765 Opc = X86ISD::UCOMI; 7766 CC = ISD::SETEQ; 7767 break; 7768 case Intrinsic::x86_sse_ucomilt_ss: 7769 case Intrinsic::x86_sse2_ucomilt_sd: 7770 Opc = X86ISD::UCOMI; 7771 CC = ISD::SETLT; 7772 break; 7773 case Intrinsic::x86_sse_ucomile_ss: 7774 case Intrinsic::x86_sse2_ucomile_sd: 7775 Opc = X86ISD::UCOMI; 7776 CC = ISD::SETLE; 7777 break; 7778 case Intrinsic::x86_sse_ucomigt_ss: 7779 case Intrinsic::x86_sse2_ucomigt_sd: 7780 Opc = X86ISD::UCOMI; 7781 CC = ISD::SETGT; 7782 break; 7783 case Intrinsic::x86_sse_ucomige_ss: 7784 case Intrinsic::x86_sse2_ucomige_sd: 7785 Opc = X86ISD::UCOMI; 7786 
CC = ISD::SETGE; 7787 break; 7788 case Intrinsic::x86_sse_ucomineq_ss: 7789 case Intrinsic::x86_sse2_ucomineq_sd: 7790 Opc = X86ISD::UCOMI; 7791 CC = ISD::SETNE; 7792 break; 7793 } 7794 7795 SDValue LHS = Op.getOperand(1); 7796 SDValue RHS = Op.getOperand(2); 7797 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 7798 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 7799 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 7800 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7801 DAG.getConstant(X86CC, MVT::i8), Cond); 7802 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7803 } 7804 // ptest and testp intrinsics. The intrinsic these come from are designed to 7805 // return an integer value, not just an instruction so lower it to the ptest 7806 // or testp pattern and a setcc for the result. 7807 case Intrinsic::x86_sse41_ptestz: 7808 case Intrinsic::x86_sse41_ptestc: 7809 case Intrinsic::x86_sse41_ptestnzc: 7810 case Intrinsic::x86_avx_ptestz_256: 7811 case Intrinsic::x86_avx_ptestc_256: 7812 case Intrinsic::x86_avx_ptestnzc_256: 7813 case Intrinsic::x86_avx_vtestz_ps: 7814 case Intrinsic::x86_avx_vtestc_ps: 7815 case Intrinsic::x86_avx_vtestnzc_ps: 7816 case Intrinsic::x86_avx_vtestz_pd: 7817 case Intrinsic::x86_avx_vtestc_pd: 7818 case Intrinsic::x86_avx_vtestnzc_pd: 7819 case Intrinsic::x86_avx_vtestz_ps_256: 7820 case Intrinsic::x86_avx_vtestc_ps_256: 7821 case Intrinsic::x86_avx_vtestnzc_ps_256: 7822 case Intrinsic::x86_avx_vtestz_pd_256: 7823 case Intrinsic::x86_avx_vtestc_pd_256: 7824 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7825 bool IsTestPacked = false; 7826 unsigned X86CC = 0; 7827 switch (IntNo) { 7828 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7829 case Intrinsic::x86_avx_vtestz_ps: 7830 case Intrinsic::x86_avx_vtestz_pd: 7831 case Intrinsic::x86_avx_vtestz_ps_256: 7832 case Intrinsic::x86_avx_vtestz_pd_256: 7833 IsTestPacked = true; // Fallthrough 7834 case Intrinsic::x86_sse41_ptestz: 7835 case Intrinsic::x86_avx_ptestz_256: 7836 // ZF = 1 7837 X86CC = X86::COND_E; 7838 break; 7839 case Intrinsic::x86_avx_vtestc_ps: 7840 case Intrinsic::x86_avx_vtestc_pd: 7841 case Intrinsic::x86_avx_vtestc_ps_256: 7842 case Intrinsic::x86_avx_vtestc_pd_256: 7843 IsTestPacked = true; // Fallthrough 7844 case Intrinsic::x86_sse41_ptestc: 7845 case Intrinsic::x86_avx_ptestc_256: 7846 // CF = 1 7847 X86CC = X86::COND_B; 7848 break; 7849 case Intrinsic::x86_avx_vtestnzc_ps: 7850 case Intrinsic::x86_avx_vtestnzc_pd: 7851 case Intrinsic::x86_avx_vtestnzc_ps_256: 7852 case Intrinsic::x86_avx_vtestnzc_pd_256: 7853 IsTestPacked = true; // Fallthrough 7854 case Intrinsic::x86_sse41_ptestnzc: 7855 case Intrinsic::x86_avx_ptestnzc_256: 7856 // ZF and CF = 0 7857 X86CC = X86::COND_A; 7858 break; 7859 } 7860 7861 SDValue LHS = Op.getOperand(1); 7862 SDValue RHS = Op.getOperand(2); 7863 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7864 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7865 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7866 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7867 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7868 } 7869 7870 // Fix vector shift instructions where the last operand is a non-immediate 7871 // i32 value. 
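// For example, x86_sse2_pslli_d whose count turns out not to be a constant is
// rewritten below as x86_sse2_psll_d: the i32 count becomes element 0 of a
// shift-amount vector (element 1 zeroed, since the hardware reads a 64-bit
// count; any remaining lanes undef), which is bitcast to the operand type and
// passed to the register-count form of the intrinsic.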
7872 case Intrinsic::x86_sse2_pslli_w: 7873 case Intrinsic::x86_sse2_pslli_d: 7874 case Intrinsic::x86_sse2_pslli_q: 7875 case Intrinsic::x86_sse2_psrli_w: 7876 case Intrinsic::x86_sse2_psrli_d: 7877 case Intrinsic::x86_sse2_psrli_q: 7878 case Intrinsic::x86_sse2_psrai_w: 7879 case Intrinsic::x86_sse2_psrai_d: 7880 case Intrinsic::x86_mmx_pslli_w: 7881 case Intrinsic::x86_mmx_pslli_d: 7882 case Intrinsic::x86_mmx_pslli_q: 7883 case Intrinsic::x86_mmx_psrli_w: 7884 case Intrinsic::x86_mmx_psrli_d: 7885 case Intrinsic::x86_mmx_psrli_q: 7886 case Intrinsic::x86_mmx_psrai_w: 7887 case Intrinsic::x86_mmx_psrai_d: { 7888 SDValue ShAmt = Op.getOperand(2); 7889 if (isa<ConstantSDNode>(ShAmt)) 7890 return SDValue(); 7891 7892 unsigned NewIntNo = 0; 7893 EVT ShAmtVT = MVT::v4i32; 7894 switch (IntNo) { 7895 case Intrinsic::x86_sse2_pslli_w: 7896 NewIntNo = Intrinsic::x86_sse2_psll_w; 7897 break; 7898 case Intrinsic::x86_sse2_pslli_d: 7899 NewIntNo = Intrinsic::x86_sse2_psll_d; 7900 break; 7901 case Intrinsic::x86_sse2_pslli_q: 7902 NewIntNo = Intrinsic::x86_sse2_psll_q; 7903 break; 7904 case Intrinsic::x86_sse2_psrli_w: 7905 NewIntNo = Intrinsic::x86_sse2_psrl_w; 7906 break; 7907 case Intrinsic::x86_sse2_psrli_d: 7908 NewIntNo = Intrinsic::x86_sse2_psrl_d; 7909 break; 7910 case Intrinsic::x86_sse2_psrli_q: 7911 NewIntNo = Intrinsic::x86_sse2_psrl_q; 7912 break; 7913 case Intrinsic::x86_sse2_psrai_w: 7914 NewIntNo = Intrinsic::x86_sse2_psra_w; 7915 break; 7916 case Intrinsic::x86_sse2_psrai_d: 7917 NewIntNo = Intrinsic::x86_sse2_psra_d; 7918 break; 7919 default: { 7920 ShAmtVT = MVT::v2i32; 7921 switch (IntNo) { 7922 case Intrinsic::x86_mmx_pslli_w: 7923 NewIntNo = Intrinsic::x86_mmx_psll_w; 7924 break; 7925 case Intrinsic::x86_mmx_pslli_d: 7926 NewIntNo = Intrinsic::x86_mmx_psll_d; 7927 break; 7928 case Intrinsic::x86_mmx_pslli_q: 7929 NewIntNo = Intrinsic::x86_mmx_psll_q; 7930 break; 7931 case Intrinsic::x86_mmx_psrli_w: 7932 NewIntNo = Intrinsic::x86_mmx_psrl_w; 7933 break; 7934 case Intrinsic::x86_mmx_psrli_d: 7935 NewIntNo = Intrinsic::x86_mmx_psrl_d; 7936 break; 7937 case Intrinsic::x86_mmx_psrli_q: 7938 NewIntNo = Intrinsic::x86_mmx_psrl_q; 7939 break; 7940 case Intrinsic::x86_mmx_psrai_w: 7941 NewIntNo = Intrinsic::x86_mmx_psra_w; 7942 break; 7943 case Intrinsic::x86_mmx_psrai_d: 7944 NewIntNo = Intrinsic::x86_mmx_psra_d; 7945 break; 7946 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 7947 } 7948 break; 7949 } 7950 } 7951 7952 // The vector shift intrinsics with scalars uses 32b shift amounts but 7953 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 7954 // to be zero. 7955 SDValue ShOps[4]; 7956 ShOps[0] = ShAmt; 7957 ShOps[1] = DAG.getConstant(0, MVT::i32); 7958 if (ShAmtVT == MVT::v4i32) { 7959 ShOps[2] = DAG.getUNDEF(MVT::i32); 7960 ShOps[3] = DAG.getUNDEF(MVT::i32); 7961 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7962 } else { 7963 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7964// FIXME this must be lowered to get rid of the invalid type. 
7965 } 7966 7967 EVT VT = Op.getValueType(); 7968 ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); 7969 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7970 DAG.getConstant(NewIntNo, MVT::i32), 7971 Op.getOperand(1), ShAmt); 7972 } 7973 } 7974} 7975 7976SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 7977 SelectionDAG &DAG) const { 7978 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7979 MFI->setReturnAddressIsTaken(true); 7980 7981 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7982 DebugLoc dl = Op.getDebugLoc(); 7983 7984 if (Depth > 0) { 7985 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7986 SDValue Offset = 7987 DAG.getConstant(TD->getPointerSize(), 7988 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7989 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7990 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7991 FrameAddr, Offset), 7992 MachinePointerInfo(), false, false, 0); 7993 } 7994 7995 // Just load the return address. 7996 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7997 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7998 RetAddrFI, MachinePointerInfo(), false, false, 0); 7999} 8000 8001SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 8002 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8003 MFI->setFrameAddressIsTaken(true); 8004 8005 EVT VT = Op.getValueType(); 8006 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 8007 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8008 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 8009 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 8010 while (Depth--) 8011 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 8012 MachinePointerInfo(), 8013 false, false, 0); 8014 return FrameAddr; 8015} 8016 8017SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 8018 SelectionDAG &DAG) const { 8019 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 8020} 8021 8022SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 8023 MachineFunction &MF = DAG.getMachineFunction(); 8024 SDValue Chain = Op.getOperand(0); 8025 SDValue Offset = Op.getOperand(1); 8026 SDValue Handler = Op.getOperand(2); 8027 DebugLoc dl = Op.getDebugLoc(); 8028 8029 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 8030 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 8031 getPointerTy()); 8032 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 8033 8034 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 8035 DAG.getIntPtrConstant(TD->getPointerSize())); 8036 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 8037 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 8038 false, false, 0); 8039 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 8040 MF.getRegInfo().addLiveOut(StoreAddrReg); 8041 8042 return DAG.getNode(X86ISD::EH_RETURN, dl, 8043 MVT::Other, 8044 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 8045} 8046 8047SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 8048 SelectionDAG &DAG) const { 8049 SDValue Root = Op.getOperand(0); 8050 SDValue Trmp = Op.getOperand(1); // trampoline 8051 SDValue FPtr = Op.getOperand(2); // nested function 8052 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 8053 DebugLoc dl = Op.getDebugLoc(); 8054 8055 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 8056 8057 if (Subtarget->is64Bit()) { 8058 SDValue OutChains[6]; 8059 8060 // Large code-model. 8061 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 8062 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 8063 8064 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 8065 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 8066 8067 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 8068 8069 // Load the pointer to the nested function into R11. 8070 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 8071 SDValue Addr = Trmp; 8072 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8073 Addr, MachinePointerInfo(TrmpAddr), 8074 false, false, 0); 8075 8076 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8077 DAG.getConstant(2, MVT::i64)); 8078 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 8079 MachinePointerInfo(TrmpAddr, 2), 8080 false, false, 2); 8081 8082 // Load the 'nest' parameter value into R10. 8083 // R10 is specified in X86CallingConv.td 8084 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 8085 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8086 DAG.getConstant(10, MVT::i64)); 8087 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8088 Addr, MachinePointerInfo(TrmpAddr, 10), 8089 false, false, 0); 8090 8091 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8092 DAG.getConstant(12, MVT::i64)); 8093 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 8094 MachinePointerInfo(TrmpAddr, 12), 8095 false, false, 2); 8096 8097 // Jump to the nested function. 8098 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
8099 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8100 DAG.getConstant(20, MVT::i64)); 8101 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8102 Addr, MachinePointerInfo(TrmpAddr, 20), 8103 false, false, 0); 8104 8105 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 8106 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8107 DAG.getConstant(22, MVT::i64)); 8108 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 8109 MachinePointerInfo(TrmpAddr, 22), 8110 false, false, 0); 8111 8112 SDValue Ops[] = 8113 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 8114 return DAG.getMergeValues(Ops, 2, dl); 8115 } else { 8116 const Function *Func = 8117 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 8118 CallingConv::ID CC = Func->getCallingConv(); 8119 unsigned NestReg; 8120 8121 switch (CC) { 8122 default: 8123 llvm_unreachable("Unsupported calling convention"); 8124 case CallingConv::C: 8125 case CallingConv::X86_StdCall: { 8126 // Pass 'nest' parameter in ECX. 8127 // Must be kept in sync with X86CallingConv.td 8128 NestReg = X86::ECX; 8129 8130 // Check that ECX wasn't needed by an 'inreg' parameter. 8131 const FunctionType *FTy = Func->getFunctionType(); 8132 const AttrListPtr &Attrs = Func->getAttributes(); 8133 8134 if (!Attrs.isEmpty() && !Func->isVarArg()) { 8135 unsigned InRegCount = 0; 8136 unsigned Idx = 1; 8137 8138 for (FunctionType::param_iterator I = FTy->param_begin(), 8139 E = FTy->param_end(); I != E; ++I, ++Idx) 8140 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 8141 // FIXME: should only count parameters that are lowered to integers. 8142 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 8143 8144 if (InRegCount > 2) { 8145 report_fatal_error("Nest register in use - reduce number of inreg" 8146 " parameters!"); 8147 } 8148 } 8149 break; 8150 } 8151 case CallingConv::X86_FastCall: 8152 case CallingConv::X86_ThisCall: 8153 case CallingConv::Fast: 8154 // Pass 'nest' parameter in EAX. 8155 // Must be kept in sync with X86CallingConv.td 8156 NestReg = X86::EAX; 8157 break; 8158 } 8159 8160 SDValue OutChains[4]; 8161 SDValue Addr, Disp; 8162 8163 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8164 DAG.getConstant(10, MVT::i32)); 8165 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 8166 8167 // This is storing the opcode for MOV32ri. 8168 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 8169 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 8170 OutChains[0] = DAG.getStore(Root, dl, 8171 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 8172 Trmp, MachinePointerInfo(TrmpAddr), 8173 false, false, 0); 8174 8175 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8176 DAG.getConstant(1, MVT::i32)); 8177 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 8178 MachinePointerInfo(TrmpAddr, 1), 8179 false, false, 1); 8180 8181 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
8182 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8183 DAG.getConstant(5, MVT::i32)); 8184 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 8185 MachinePointerInfo(TrmpAddr, 5), 8186 false, false, 1); 8187 8188 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8189 DAG.getConstant(6, MVT::i32)); 8190 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 8191 MachinePointerInfo(TrmpAddr, 6), 8192 false, false, 1); 8193 8194 SDValue Ops[] = 8195 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 8196 return DAG.getMergeValues(Ops, 2, dl); 8197 } 8198} 8199 8200SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 8201 SelectionDAG &DAG) const { 8202 /* 8203 The rounding mode is in bits 11:10 of FPSR, and has the following 8204 settings: 8205 00 Round to nearest 8206 01 Round to -inf 8207 10 Round to +inf 8208 11 Round to 0 8209 8210 FLT_ROUNDS, on the other hand, expects the following: 8211 -1 Undefined 8212 0 Round to 0 8213 1 Round to nearest 8214 2 Round to +inf 8215 3 Round to -inf 8216 8217 To perform the conversion, we do: 8218 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 8219 */ 8220 8221 MachineFunction &MF = DAG.getMachineFunction(); 8222 const TargetMachine &TM = MF.getTarget(); 8223 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 8224 unsigned StackAlignment = TFI.getStackAlignment(); 8225 EVT VT = Op.getValueType(); 8226 DebugLoc DL = Op.getDebugLoc(); 8227 8228 // Save FP Control Word to stack slot 8229 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 8230 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8231 8232 8233 MachineMemOperand *MMO = 8234 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8235 MachineMemOperand::MOStore, 2, 2); 8236 8237 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 8238 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 8239 DAG.getVTList(MVT::Other), 8240 Ops, 2, MVT::i16, MMO); 8241 8242 // Load FP Control Word from stack slot 8243 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 8244 MachinePointerInfo(), false, false, 0); 8245 8246 // Transform as necessary 8247 SDValue CWD1 = 8248 DAG.getNode(ISD::SRL, DL, MVT::i16, 8249 DAG.getNode(ISD::AND, DL, MVT::i16, 8250 CWD, DAG.getConstant(0x800, MVT::i16)), 8251 DAG.getConstant(11, MVT::i8)); 8252 SDValue CWD2 = 8253 DAG.getNode(ISD::SRL, DL, MVT::i16, 8254 DAG.getNode(ISD::AND, DL, MVT::i16, 8255 CWD, DAG.getConstant(0x400, MVT::i16)), 8256 DAG.getConstant(9, MVT::i8)); 8257 8258 SDValue RetVal = 8259 DAG.getNode(ISD::AND, DL, MVT::i16, 8260 DAG.getNode(ISD::ADD, DL, MVT::i16, 8261 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 8262 DAG.getConstant(1, MVT::i16)), 8263 DAG.getConstant(3, MVT::i16)); 8264 8265 8266 return DAG.getNode((VT.getSizeInBits() < 16 ? 8267 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 8268} 8269 8270SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 8271 EVT VT = Op.getValueType(); 8272 EVT OpVT = VT; 8273 unsigned NumBits = VT.getSizeInBits(); 8274 DebugLoc dl = Op.getDebugLoc(); 8275 8276 Op = Op.getOperand(0); 8277 if (VT == MVT::i8) { 8278 // Zero extend to i32 since there is not an i8 bsr. 8279 OpVT = MVT::i32; 8280 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8281 } 8282 8283 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 8284 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8285 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 8286 8287 // If src is zero (i.e. 
bsr sets ZF), returns NumBits. 8288 SDValue Ops[] = { 8289 Op, 8290 DAG.getConstant(NumBits+NumBits-1, OpVT), 8291 DAG.getConstant(X86::COND_E, MVT::i8), 8292 Op.getValue(1) 8293 }; 8294 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8295 8296 // Finally xor with NumBits-1. 8297 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 8298 8299 if (VT == MVT::i8) 8300 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8301 return Op; 8302} 8303 8304SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 8305 EVT VT = Op.getValueType(); 8306 EVT OpVT = VT; 8307 unsigned NumBits = VT.getSizeInBits(); 8308 DebugLoc dl = Op.getDebugLoc(); 8309 8310 Op = Op.getOperand(0); 8311 if (VT == MVT::i8) { 8312 OpVT = MVT::i32; 8313 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8314 } 8315 8316 // Issue a bsf (scan bits forward) which also sets EFLAGS. 8317 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8318 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 8319 8320 // If src is zero (i.e. bsf sets ZF), returns NumBits. 8321 SDValue Ops[] = { 8322 Op, 8323 DAG.getConstant(NumBits, OpVT), 8324 DAG.getConstant(X86::COND_E, MVT::i8), 8325 Op.getValue(1) 8326 }; 8327 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8328 8329 if (VT == MVT::i8) 8330 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8331 return Op; 8332} 8333 8334SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 8335 EVT VT = Op.getValueType(); 8336 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 8337 DebugLoc dl = Op.getDebugLoc(); 8338 8339 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 8340 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 8341 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 8342 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 8343 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 8344 // 8345 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 8346 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 8347 // return AloBlo + AloBhi + AhiBlo; 8348 8349 SDValue A = Op.getOperand(0); 8350 SDValue B = Op.getOperand(1); 8351 8352 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8353 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8354 A, DAG.getConstant(32, MVT::i32)); 8355 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8356 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8357 B, DAG.getConstant(32, MVT::i32)); 8358 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8359 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8360 A, B); 8361 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8362 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8363 A, Bhi); 8364 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8365 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8366 Ahi, B); 8367 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8368 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8369 AloBhi, DAG.getConstant(32, MVT::i32)); 8370 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8371 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8372 AhiBlo, DAG.getConstant(32, MVT::i32)); 8373 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 8374 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 8375 return Res; 8376} 8377 8378SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { 8379 EVT VT = Op.getValueType(); 8380 DebugLoc dl = 
Op.getDebugLoc(); 8381 SDValue R = Op.getOperand(0); 8382 8383 LLVMContext *Context = DAG.getContext(); 8384 8385 assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); 8386 8387 if (VT == MVT::v4i32) { 8388 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8389 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 8390 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 8391 8392 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 8393 8394 std::vector<Constant*> CV(4, CI); 8395 Constant *C = ConstantVector::get(CV); 8396 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8397 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8398 MachinePointerInfo::getConstantPool(), 8399 false, false, 16); 8400 8401 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 8402 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 8403 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 8404 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 8405 } 8406 if (VT == MVT::v16i8) { 8407 // a = a << 5; 8408 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8409 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 8410 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 8411 8412 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 8413 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 8414 8415 std::vector<Constant*> CVM1(16, CM1); 8416 std::vector<Constant*> CVM2(16, CM2); 8417 Constant *C = ConstantVector::get(CVM1); 8418 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8419 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8420 MachinePointerInfo::getConstantPool(), 8421 false, false, 16); 8422 8423 // r = pblendv(r, psllw(r & (char16)15, 4), a); 8424 M = DAG.getNode(ISD::AND, dl, VT, R, M); 8425 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8426 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 8427 DAG.getConstant(4, MVT::i32)); 8428 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 8429 // a += a 8430 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 8431 8432 C = ConstantVector::get(CVM2); 8433 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8434 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8435 MachinePointerInfo::getConstantPool(), 8436 false, false, 16); 8437 8438 // r = pblendv(r, psllw(r & (char16)63, 2), a); 8439 M = DAG.getNode(ISD::AND, dl, VT, R, M); 8440 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8441 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 8442 DAG.getConstant(2, MVT::i32)); 8443 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 8444 // a += a 8445 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 8446 8447 // return pblendv(r, r+r, a); 8448 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, 8449 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 8450 return R; 8451 } 8452 return SDValue(); 8453} 8454 8455SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 8456 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 8457 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 8458 // looks for this combo and may remove the "setcc" instruction if the "setcc" 8459 // has only one use. 
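// For example, IR such as
//   %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
// arrives here as ISD::SADDO and is lowered to an X86ISD::ADD node that also
// produces EFLAGS, followed by an X86ISD::SETCC of X86::COND_O that yields the
// i1 overflow result.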
8460 SDNode *N = Op.getNode(); 8461 SDValue LHS = N->getOperand(0); 8462 SDValue RHS = N->getOperand(1); 8463 unsigned BaseOp = 0; 8464 unsigned Cond = 0; 8465 DebugLoc DL = Op.getDebugLoc(); 8466 switch (Op.getOpcode()) { 8467 default: llvm_unreachable("Unknown ovf instruction!"); 8468 case ISD::SADDO: 8469 // A subtract of one will be selected as a INC. Note that INC doesn't 8470 // set CF, so we can't do this for UADDO. 8471 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 8472 if (C->getAPIntValue() == 1) { 8473 BaseOp = X86ISD::INC; 8474 Cond = X86::COND_O; 8475 break; 8476 } 8477 BaseOp = X86ISD::ADD; 8478 Cond = X86::COND_O; 8479 break; 8480 case ISD::UADDO: 8481 BaseOp = X86ISD::ADD; 8482 Cond = X86::COND_B; 8483 break; 8484 case ISD::SSUBO: 8485 // A subtract of one will be selected as a DEC. Note that DEC doesn't 8486 // set CF, so we can't do this for USUBO. 8487 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 8488 if (C->getAPIntValue() == 1) { 8489 BaseOp = X86ISD::DEC; 8490 Cond = X86::COND_O; 8491 break; 8492 } 8493 BaseOp = X86ISD::SUB; 8494 Cond = X86::COND_O; 8495 break; 8496 case ISD::USUBO: 8497 BaseOp = X86ISD::SUB; 8498 Cond = X86::COND_B; 8499 break; 8500 case ISD::SMULO: 8501 BaseOp = X86ISD::SMUL; 8502 Cond = X86::COND_O; 8503 break; 8504 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 8505 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 8506 MVT::i32); 8507 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 8508 8509 SDValue SetCC = 8510 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 8511 DAG.getConstant(X86::COND_O, MVT::i32), 8512 SDValue(Sum.getNode(), 2)); 8513 8514 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 8515 return Sum; 8516 } 8517 } 8518 8519 // Also sets EFLAGS. 8520 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 8521 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 8522 8523 SDValue SetCC = 8524 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 8525 DAG.getConstant(Cond, MVT::i32), 8526 SDValue(Sum.getNode(), 1)); 8527 8528 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 8529 return Sum; 8530} 8531 8532SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 8533 DebugLoc dl = Op.getDebugLoc(); 8534 8535 if (!Subtarget->hasSSE2()) { 8536 SDValue Chain = Op.getOperand(0); 8537 SDValue Zero = DAG.getConstant(0, 8538 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 8539 SDValue Ops[] = { 8540 DAG.getRegister(X86::ESP, MVT::i32), // Base 8541 DAG.getTargetConstant(1, MVT::i8), // Scale 8542 DAG.getRegister(0, MVT::i32), // Index 8543 DAG.getTargetConstant(0, MVT::i32), // Disp 8544 DAG.getRegister(0, MVT::i32), // Segment. 
8545 Zero, 8546 Chain 8547 }; 8548 SDNode *Res = 8549 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 8550 array_lengthof(Ops)); 8551 return SDValue(Res, 0); 8552 } 8553 8554 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 8555 if (!isDev) 8556 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 8557 8558 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8559 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8560 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 8561 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 8562 8563 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 8564 if (!Op1 && !Op2 && !Op3 && Op4) 8565 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 8566 8567 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 8568 if (Op1 && !Op2 && !Op3 && !Op4) 8569 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 8570 8571 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 8572 // (MFENCE)>; 8573 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 8574} 8575 8576SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 8577 EVT T = Op.getValueType(); 8578 DebugLoc DL = Op.getDebugLoc(); 8579 unsigned Reg = 0; 8580 unsigned size = 0; 8581 switch(T.getSimpleVT().SimpleTy) { 8582 default: 8583 assert(false && "Invalid value type!"); 8584 case MVT::i8: Reg = X86::AL; size = 1; break; 8585 case MVT::i16: Reg = X86::AX; size = 2; break; 8586 case MVT::i32: Reg = X86::EAX; size = 4; break; 8587 case MVT::i64: 8588 assert(Subtarget->is64Bit() && "Node not type legal!"); 8589 Reg = X86::RAX; size = 8; 8590 break; 8591 } 8592 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 8593 Op.getOperand(2), SDValue()); 8594 SDValue Ops[] = { cpIn.getValue(0), 8595 Op.getOperand(1), 8596 Op.getOperand(3), 8597 DAG.getTargetConstant(size, MVT::i8), 8598 cpIn.getValue(1) }; 8599 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 8600 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 8601 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 8602 Ops, 5, T, MMO); 8603 SDValue cpOut = 8604 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 8605 return cpOut; 8606} 8607 8608SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 8609 SelectionDAG &DAG) const { 8610 assert(Subtarget->is64Bit() && "Result not type legalized?"); 8611 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 8612 SDValue TheChain = Op.getOperand(0); 8613 DebugLoc dl = Op.getDebugLoc(); 8614 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8615 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 8616 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 8617 rax.getValue(2)); 8618 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 8619 DAG.getConstant(32, MVT::i8)); 8620 SDValue Ops[] = { 8621 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 8622 rdx.getValue(1) 8623 }; 8624 return DAG.getMergeValues(Ops, 2, dl); 8625} 8626 8627SDValue X86TargetLowering::LowerBITCAST(SDValue Op, 8628 SelectionDAG &DAG) const { 8629 EVT SrcVT = Op.getOperand(0).getValueType(); 8630 EVT DstVT = Op.getValueType(); 8631 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 8632 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 
8633 assert((DstVT == MVT::i64 || 8634 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 8635 "Unexpected custom BITCAST"); 8636 // i64 <=> MMX conversions are Legal. 8637 if (SrcVT==MVT::i64 && DstVT.isVector()) 8638 return Op; 8639 if (DstVT==MVT::i64 && SrcVT.isVector()) 8640 return Op; 8641 // MMX <=> MMX conversions are Legal. 8642 if (SrcVT.isVector() && DstVT.isVector()) 8643 return Op; 8644 // All other conversions need to be expanded. 8645 return SDValue(); 8646} 8647 8648SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 8649 SDNode *Node = Op.getNode(); 8650 DebugLoc dl = Node->getDebugLoc(); 8651 EVT T = Node->getValueType(0); 8652 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 8653 DAG.getConstant(0, T), Node->getOperand(2)); 8654 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 8655 cast<AtomicSDNode>(Node)->getMemoryVT(), 8656 Node->getOperand(0), 8657 Node->getOperand(1), negOp, 8658 cast<AtomicSDNode>(Node)->getSrcValue(), 8659 cast<AtomicSDNode>(Node)->getAlignment()); 8660} 8661 8662static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 8663 EVT VT = Op.getNode()->getValueType(0); 8664 8665 // Let legalize expand this if it isn't a legal type yet. 8666 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 8667 return SDValue(); 8668 8669 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 8670 8671 unsigned Opc; 8672 bool ExtraOp = false; 8673 switch (Op.getOpcode()) { 8674 default: assert(0 && "Invalid code"); 8675 case ISD::ADDC: Opc = X86ISD::ADD; break; 8676 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 8677 case ISD::SUBC: Opc = X86ISD::SUB; break; 8678 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 8679 } 8680 8681 if (!ExtraOp) 8682 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 8683 Op.getOperand(1)); 8684 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 8685 Op.getOperand(1), Op.getOperand(2)); 8686} 8687 8688/// LowerOperation - Provide custom lowering hooks for some operations. 
8689/// 8690SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 8691 switch (Op.getOpcode()) { 8692 default: llvm_unreachable("Should not custom lower this!"); 8693 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 8694 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 8695 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 8696 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 8697 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 8698 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 8699 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 8700 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 8701 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 8702 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); 8703 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 8704 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 8705 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 8706 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 8707 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 8708 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 8709 case ISD::SHL_PARTS: 8710 case ISD::SRA_PARTS: 8711 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 8712 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 8713 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 8714 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 8715 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 8716 case ISD::FABS: return LowerFABS(Op, DAG); 8717 case ISD::FNEG: return LowerFNEG(Op, DAG); 8718 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 8719 case ISD::SETCC: return LowerSETCC(Op, DAG); 8720 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 8721 case ISD::SELECT: return LowerSELECT(Op, DAG); 8722 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 8723 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 8724 case ISD::VASTART: return LowerVASTART(Op, DAG); 8725 case ISD::VAARG: return LowerVAARG(Op, DAG); 8726 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 8727 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 8728 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 8729 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 8730 case ISD::FRAME_TO_ARGS_OFFSET: 8731 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 8732 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 8733 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 8734 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 8735 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 8736 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 8737 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 8738 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 8739 case ISD::SHL: return LowerSHL(Op, DAG); 8740 case ISD::SADDO: 8741 case ISD::UADDO: 8742 case ISD::SSUBO: 8743 case ISD::USUBO: 8744 case ISD::SMULO: 8745 case ISD::UMULO: return LowerXALUO(Op, DAG); 8746 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 8747 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 8748 case ISD::ADDC: 8749 case ISD::ADDE: 8750 case ISD::SUBC: 8751 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 8752 } 8753} 8754 8755void X86TargetLowering:: 8756ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 8757 SelectionDAG &DAG, unsigned NewOp) const { 8758 EVT T = Node->getValueType(0); 8759 
DebugLoc dl = Node->getDebugLoc(); 8760 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 8761 8762 SDValue Chain = Node->getOperand(0); 8763 SDValue In1 = Node->getOperand(1); 8764 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8765 Node->getOperand(2), DAG.getIntPtrConstant(0)); 8766 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8767 Node->getOperand(2), DAG.getIntPtrConstant(1)); 8768 SDValue Ops[] = { Chain, In1, In2L, In2H }; 8769 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 8770 SDValue Result = 8771 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 8772 cast<MemSDNode>(Node)->getMemOperand()); 8773 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 8774 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8775 Results.push_back(Result.getValue(2)); 8776} 8777 8778/// ReplaceNodeResults - Replace a node with an illegal result type 8779/// with a new node built out of custom code. 8780void X86TargetLowering::ReplaceNodeResults(SDNode *N, 8781 SmallVectorImpl<SDValue>&Results, 8782 SelectionDAG &DAG) const { 8783 DebugLoc dl = N->getDebugLoc(); 8784 switch (N->getOpcode()) { 8785 default: 8786 assert(false && "Do not know how to custom type legalize this operation!"); 8787 return; 8788 case ISD::ADDC: 8789 case ISD::ADDE: 8790 case ISD::SUBC: 8791 case ISD::SUBE: 8792 // We don't want to expand or promote these. 8793 return; 8794 case ISD::FP_TO_SINT: { 8795 std::pair<SDValue,SDValue> Vals = 8796 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 8797 SDValue FIST = Vals.first, StackSlot = Vals.second; 8798 if (FIST.getNode() != 0) { 8799 EVT VT = N->getValueType(0); 8800 // Return a load from the stack slot. 8801 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 8802 MachinePointerInfo(), false, false, 0)); 8803 } 8804 return; 8805 } 8806 case ISD::READCYCLECOUNTER: { 8807 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 8808 SDValue TheChain = N->getOperand(0); 8809 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8810 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 8811 rd.getValue(1)); 8812 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 8813 eax.getValue(2)); 8814 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
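// That is, the returned value is (i64)EDX << 32 | EAX, the same layout the
// 64-bit path in LowerREADCYCLECOUNTER above builds with an explicit shift
// and or.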
8815 SDValue Ops[] = { eax, edx }; 8816 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 8817 Results.push_back(edx.getValue(1)); 8818 return; 8819 } 8820 case ISD::ATOMIC_CMP_SWAP: { 8821 EVT T = N->getValueType(0); 8822 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 8823 SDValue cpInL, cpInH; 8824 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8825 DAG.getConstant(0, MVT::i32)); 8826 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8827 DAG.getConstant(1, MVT::i32)); 8828 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 8829 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 8830 cpInL.getValue(1)); 8831 SDValue swapInL, swapInH; 8832 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8833 DAG.getConstant(0, MVT::i32)); 8834 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8835 DAG.getConstant(1, MVT::i32)); 8836 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 8837 cpInH.getValue(1)); 8838 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 8839 swapInL.getValue(1)); 8840 SDValue Ops[] = { swapInH.getValue(0), 8841 N->getOperand(1), 8842 swapInH.getValue(1) }; 8843 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 8844 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 8845 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, 8846 Ops, 3, T, MMO); 8847 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 8848 MVT::i32, Result.getValue(1)); 8849 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 8850 MVT::i32, cpOutL.getValue(2)); 8851 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 8852 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8853 Results.push_back(cpOutH.getValue(1)); 8854 return; 8855 } 8856 case ISD::ATOMIC_LOAD_ADD: 8857 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 8858 return; 8859 case ISD::ATOMIC_LOAD_AND: 8860 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 8861 return; 8862 case ISD::ATOMIC_LOAD_NAND: 8863 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 8864 return; 8865 case ISD::ATOMIC_LOAD_OR: 8866 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 8867 return; 8868 case ISD::ATOMIC_LOAD_SUB: 8869 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 8870 return; 8871 case ISD::ATOMIC_LOAD_XOR: 8872 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 8873 return; 8874 case ISD::ATOMIC_SWAP: 8875 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 8876 return; 8877 } 8878} 8879 8880const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 8881 switch (Opcode) { 8882 default: return NULL; 8883 case X86ISD::BSF: return "X86ISD::BSF"; 8884 case X86ISD::BSR: return "X86ISD::BSR"; 8885 case X86ISD::SHLD: return "X86ISD::SHLD"; 8886 case X86ISD::SHRD: return "X86ISD::SHRD"; 8887 case X86ISD::FAND: return "X86ISD::FAND"; 8888 case X86ISD::FOR: return "X86ISD::FOR"; 8889 case X86ISD::FXOR: return "X86ISD::FXOR"; 8890 case X86ISD::FSRL: return "X86ISD::FSRL"; 8891 case X86ISD::FILD: return "X86ISD::FILD"; 8892 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 8893 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 8894 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 8895 case 
X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 8896 case X86ISD::FLD: return "X86ISD::FLD"; 8897 case X86ISD::FST: return "X86ISD::FST"; 8898 case X86ISD::CALL: return "X86ISD::CALL"; 8899 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 8900 case X86ISD::BT: return "X86ISD::BT"; 8901 case X86ISD::CMP: return "X86ISD::CMP"; 8902 case X86ISD::COMI: return "X86ISD::COMI"; 8903 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 8904 case X86ISD::SETCC: return "X86ISD::SETCC"; 8905 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 8906 case X86ISD::CMOV: return "X86ISD::CMOV"; 8907 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 8908 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 8909 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 8910 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 8911 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 8912 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 8913 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 8914 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 8915 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 8916 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 8917 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 8918 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 8919 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 8920 case X86ISD::PANDN: return "X86ISD::PANDN"; 8921 case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; 8922 case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; 8923 case X86ISD::PSIGND: return "X86ISD::PSIGND"; 8924 case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB"; 8925 case X86ISD::FMAX: return "X86ISD::FMAX"; 8926 case X86ISD::FMIN: return "X86ISD::FMIN"; 8927 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 8928 case X86ISD::FRCP: return "X86ISD::FRCP"; 8929 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 8930 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 8931 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 8932 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 8933 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 8934 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 8935 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 8936 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 8937 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 8938 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 8939 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 8940 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 8941 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 8942 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 8943 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 8944 case X86ISD::VSHL: return "X86ISD::VSHL"; 8945 case X86ISD::VSRL: return "X86ISD::VSRL"; 8946 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 8947 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 8948 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 8949 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 8950 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 8951 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 8952 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 8953 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 8954 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 8955 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 8956 case X86ISD::ADD: return "X86ISD::ADD"; 8957 case X86ISD::SUB: return "X86ISD::SUB"; 8958 case X86ISD::ADC: return "X86ISD::ADC"; 8959 case X86ISD::SBB: return "X86ISD::SBB"; 8960 case X86ISD::SMUL: return "X86ISD::SMUL"; 8961 case X86ISD::UMUL: return "X86ISD::UMUL"; 
8962 case X86ISD::INC: return "X86ISD::INC"; 8963 case X86ISD::DEC: return "X86ISD::DEC"; 8964 case X86ISD::OR: return "X86ISD::OR"; 8965 case X86ISD::XOR: return "X86ISD::XOR"; 8966 case X86ISD::AND: return "X86ISD::AND"; 8967 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 8968 case X86ISD::PTEST: return "X86ISD::PTEST"; 8969 case X86ISD::TESTP: return "X86ISD::TESTP"; 8970 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 8971 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 8972 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 8973 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 8974 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 8975 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 8976 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 8977 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 8978 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 8979 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 8980 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 8981 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 8982 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 8983 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 8984 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 8985 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 8986 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 8987 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 8988 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 8989 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 8990 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 8991 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 8992 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 8993 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 8994 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 8995 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 8996 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 8997 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 8998 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 8999 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 9000 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 9001 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 9002 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 9003 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 9004 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 9005 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 9006 } 9007} 9008 9009// isLegalAddressingMode - Return true if the addressing mode represented 9010// by AM is legal for this target, for a load/store of the specified type. 9011bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 9012 const Type *Ty) const { 9013 // X86 supports extremely general addressing modes. 9014 CodeModel::Model M = getTargetMachine().getCodeModel(); 9015 Reloc::Model R = getTargetMachine().getRelocationModel(); 9016 9017 // X86 allows a sign-extended 32-bit immediate field as a displacement. 9018 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 9019 return false; 9020 9021 if (AM.BaseGV) { 9022 unsigned GVFlags = 9023 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 9024 9025 // If a reference to this global requires an extra load, we can't fold it. 9026 if (isGlobalStubReference(GVFlags)) 9027 return false; 9028 9029 // If BaseGV requires a register for the PIC base, we cannot also have a 9030 // BaseReg specified. 9031 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 9032 return false; 9033 9034 // If lower 4G is not available, then we must use rip-relative addressing. 
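// A global that must be referenced rip-relative occupies the displacement
// field of the address, so the check below also rejects an extra constant
// offset or a scale greater than 1 in that situation.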
9035 if ((M != CodeModel::Small || R != Reloc::Static) && 9036 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 9037 return false; 9038 } 9039 9040 switch (AM.Scale) { 9041 case 0: 9042 case 1: 9043 case 2: 9044 case 4: 9045 case 8: 9046 // These scales always work. 9047 break; 9048 case 3: 9049 case 5: 9050 case 9: 9051 // These scales are formed with basereg+scalereg. Only accept if there is 9052 // no basereg yet. 9053 if (AM.HasBaseReg) 9054 return false; 9055 break; 9056 default: // Other stuff never works. 9057 return false; 9058 } 9059 9060 return true; 9061} 9062 9063 9064bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 9065 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 9066 return false; 9067 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 9068 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 9069 if (NumBits1 <= NumBits2) 9070 return false; 9071 return true; 9072} 9073 9074bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 9075 if (!VT1.isInteger() || !VT2.isInteger()) 9076 return false; 9077 unsigned NumBits1 = VT1.getSizeInBits(); 9078 unsigned NumBits2 = VT2.getSizeInBits(); 9079 if (NumBits1 <= NumBits2) 9080 return false; 9081 return true; 9082} 9083 9084bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 9085 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 9086 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 9087} 9088 9089bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 9090 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 9091 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 9092} 9093 9094bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 9095 // i16 instructions are longer (0x66 prefix) and potentially slower. 9096 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 9097} 9098 9099/// isShuffleMaskLegal - Targets can use this to indicate that they only 9100/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 9101/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 9102/// are assumed to be legal. 9103bool 9104X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 9105 EVT VT) const { 9106 // Very little shuffling can be done for 64-bit vectors right now. 9107 if (VT.getSizeInBits() == 64) 9108 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 9109 9110 // FIXME: pshufb, blends, shifts. 9111 return (VT.getVectorNumElements() == 2 || 9112 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 9113 isMOVLMask(M, VT) || 9114 isSHUFPMask(M, VT) || 9115 isPSHUFDMask(M, VT) || 9116 isPSHUFHWMask(M, VT) || 9117 isPSHUFLWMask(M, VT) || 9118 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 9119 isUNPCKLMask(M, VT) || 9120 isUNPCKHMask(M, VT) || 9121 isUNPCKL_v_undef_Mask(M, VT) || 9122 isUNPCKH_v_undef_Mask(M, VT)); 9123} 9124 9125bool 9126X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 9127 EVT VT) const { 9128 unsigned NumElts = VT.getVectorNumElements(); 9129 // FIXME: This collection of masks seems suspect. 
9130   if (NumElts == 2)
9131     return true;
9132   if (NumElts == 4 && VT.getSizeInBits() == 128) {
9133     return (isMOVLMask(Mask, VT)  ||
9134             isCommutedMOVLMask(Mask, VT, true) ||
9135             isSHUFPMask(Mask, VT) ||
9136             isCommutedSHUFPMask(Mask, VT));
9137   }
9138   return false;
9139 }
9140
9141 //===----------------------------------------------------------------------===//
9142 //                           X86 Scheduler Hooks
9143 //===----------------------------------------------------------------------===//
9144
9145 // private utility function
9146 MachineBasicBlock *
9147 X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
9148                                                        MachineBasicBlock *MBB,
9149                                                        unsigned regOpc,
9150                                                        unsigned immOpc,
9151                                                        unsigned LoadOpc,
9152                                                        unsigned CXchgOpc,
9153                                                        unsigned notOpc,
9154                                                        unsigned EAXreg,
9155                                                        TargetRegisterClass *RC,
9156                                                        bool invSrc) const {
9157   // For the atomic bitwise operator, we generate
9158   //   thisMBB:
9159   //   newMBB:
9160   //     ld  t1 = [bitinstr.addr]
9161   //     op  t2 = t1, [bitinstr.val]
9162   //     mov EAX = t1
9163   //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
9164   //     bnz newMBB   (repeat while the compare-exchange keeps failing)
9165   //     fallthrough --> nextMBB
9166   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9167   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
9168   MachineFunction::iterator MBBIter = MBB;
9169   ++MBBIter;
9170
9171   // First build the CFG.
9172   MachineFunction *F = MBB->getParent();
9173   MachineBasicBlock *thisMBB = MBB;
9174   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
9175   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
9176   F->insert(MBBIter, newMBB);
9177   F->insert(MBBIter, nextMBB);
9178
9179   // Transfer the remainder of thisMBB and its successor edges to nextMBB.
9180   nextMBB->splice(nextMBB->begin(), thisMBB,
9181                   llvm::next(MachineBasicBlock::iterator(bInstr)),
9182                   thisMBB->end());
9183   nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
9184
9185   // Update thisMBB to fall through to newMBB.
9186   thisMBB->addSuccessor(newMBB);
9187
9188   // newMBB branches back to itself and falls through to nextMBB.
9189   newMBB->addSuccessor(nextMBB);
9190   newMBB->addSuccessor(newMBB);
9191
9192   // Insert instructions into newMBB based on incoming instruction.
9193   assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
9194          "unexpected number of operands");
9195   DebugLoc dl = bInstr->getDebugLoc();
9196   MachineOperand& destOper = bInstr->getOperand(0);
9197   MachineOperand* argOpers[2 + X86::AddrNumOperands];
9198   int numArgs = bInstr->getNumOperands() - 1;
9199   for (int i=0; i < numArgs; ++i)
9200     argOpers[i] = &bInstr->getOperand(i+1);
9201
9202   // An x86 address has 5 operands: base, scale, index, displacement, segment.
9203   int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
9204   int valArgIndx = lastAddrIndx + 1;
9205
9206   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
9207   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
9208   for (int i=0; i <= lastAddrIndx; ++i)
9209     (*MIB).addOperand(*argOpers[i]);
9210
9211   unsigned tt = F->getRegInfo().createVirtualRegister(RC);
9212   if (invSrc) {
9213     MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
9214   }
9215   else
9216     tt = t1;
9217
9218   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
9219   assert((argOpers[valArgIndx]->isReg() ||
9220           argOpers[valArgIndx]->isImm()) &&
9221          "invalid operand");
9222   if (argOpers[valArgIndx]->isReg())
9223     MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
9224   else
9225     MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
9226   MIB.addReg(tt);
9227
(*MIB).addOperand(*argOpers[valArgIndx]); 9228 9229 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 9230 MIB.addReg(t1); 9231 9232 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 9233 for (int i=0; i <= lastAddrIndx; ++i) 9234 (*MIB).addOperand(*argOpers[i]); 9235 MIB.addReg(t2); 9236 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9237 (*MIB).setMemRefs(bInstr->memoperands_begin(), 9238 bInstr->memoperands_end()); 9239 9240 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 9241 MIB.addReg(EAXreg); 9242 9243 // insert branch 9244 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9245 9246 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 9247 return nextMBB; 9248} 9249 9250// private utility function: 64 bit atomics on 32 bit host. 9251MachineBasicBlock * 9252X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 9253 MachineBasicBlock *MBB, 9254 unsigned regOpcL, 9255 unsigned regOpcH, 9256 unsigned immOpcL, 9257 unsigned immOpcH, 9258 bool invSrc) const { 9259 // For the atomic bitwise operator, we generate 9260 // thisMBB (instructions are in pairs, except cmpxchg8b) 9261 // ld t1,t2 = [bitinstr.addr] 9262 // newMBB: 9263 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 9264 // op t5, t6 <- out1, out2, [bitinstr.val] 9265 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 9266 // mov ECX, EBX <- t5, t6 9267 // mov EAX, EDX <- t1, t2 9268 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 9269 // mov t3, t4 <- EAX, EDX 9270 // bz newMBB 9271 // result in out1, out2 9272 // fallthrough -->nextMBB 9273 9274 const TargetRegisterClass *RC = X86::GR32RegisterClass; 9275 const unsigned LoadOpc = X86::MOV32rm; 9276 const unsigned NotOpc = X86::NOT32r; 9277 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9278 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9279 MachineFunction::iterator MBBIter = MBB; 9280 ++MBBIter; 9281 9282 /// First build the CFG 9283 MachineFunction *F = MBB->getParent(); 9284 MachineBasicBlock *thisMBB = MBB; 9285 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9286 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9287 F->insert(MBBIter, newMBB); 9288 F->insert(MBBIter, nextMBB); 9289 9290 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9291 nextMBB->splice(nextMBB->begin(), thisMBB, 9292 llvm::next(MachineBasicBlock::iterator(bInstr)), 9293 thisMBB->end()); 9294 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9295 9296 // Update thisMBB to fall through to newMBB 9297 thisMBB->addSuccessor(newMBB); 9298 9299 // newMBB jumps to itself and fall through to nextMBB 9300 newMBB->addSuccessor(nextMBB); 9301 newMBB->addSuccessor(newMBB); 9302 9303 DebugLoc dl = bInstr->getDebugLoc(); 9304 // Insert instructions into newMBB based on incoming instruction 9305 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 9306 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 9307 "unexpected number of operands"); 9308 MachineOperand& dest1Oper = bInstr->getOperand(0); 9309 MachineOperand& dest2Oper = bInstr->getOperand(1); 9310 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9311 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 9312 argOpers[i] = &bInstr->getOperand(i+2); 9313 9314 // We use some of the operands multiple times, so conservatively just 9315 // clear any kill flags that might be present. 
9316     if (argOpers[i]->isReg() && argOpers[i]->isUse())
9317       argOpers[i]->setIsKill(false);
9318   }
9319
9320   // An x86 address has 5 operands: base, scale, index, displacement, segment.
9321   int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
9322
9323   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
9324   MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
9325   for (int i=0; i <= lastAddrIndx; ++i)
9326     (*MIB).addOperand(*argOpers[i]);
9327   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
9328   MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
9329   // add 4 to displacement.
9330   for (int i=0; i <= lastAddrIndx-2; ++i)
9331     (*MIB).addOperand(*argOpers[i]);
9332   MachineOperand newOp3 = *(argOpers[3]);
9333   if (newOp3.isImm())
9334     newOp3.setImm(newOp3.getImm()+4);
9335   else
9336     newOp3.setOffset(newOp3.getOffset()+4);
9337   (*MIB).addOperand(newOp3);
9338   (*MIB).addOperand(*argOpers[lastAddrIndx]);
9339
9340   // t3/4 are defined later, at the bottom of the loop
9341   unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
9342   unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
9343   BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
9344     .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
9345   BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
9346     .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
9347
9348   // The subsequent operations should be using the destination registers of
9349   // the PHI instructions.
9350   if (invSrc) {
9351     t1 = F->getRegInfo().createVirtualRegister(RC);
9352     t2 = F->getRegInfo().createVirtualRegister(RC);
9353     MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
9354     MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
9355   } else {
9356     t1 = dest1Oper.getReg();
9357     t2 = dest2Oper.getReg();
9358   }
9359
9360   int valArgIndx = lastAddrIndx + 1;
9361   assert((argOpers[valArgIndx]->isReg() ||
9362           argOpers[valArgIndx]->isImm()) &&
9363          "invalid operand");
9364   unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
9365   unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
9366   if (argOpers[valArgIndx]->isReg())
9367     MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
9368   else
9369     MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
9370   if (regOpcL != X86::MOV32rr)
9371     MIB.addReg(t1);
9372   (*MIB).addOperand(*argOpers[valArgIndx]);
9373   assert(argOpers[valArgIndx + 1]->isReg() ==
9374          argOpers[valArgIndx]->isReg());
9375   assert(argOpers[valArgIndx + 1]->isImm() ==
9376          argOpers[valArgIndx]->isImm());
9377   if (argOpers[valArgIndx + 1]->isReg())
9378     MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
9379   else
9380     MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
9381   if (regOpcH != X86::MOV32rr)
9382     MIB.addReg(t2);
9383   (*MIB).addOperand(*argOpers[valArgIndx + 1]);
9384
9385   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
9386   MIB.addReg(t1);
9387   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
9388   MIB.addReg(t2);
9389
9390   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
9391   MIB.addReg(t5);
9392   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
9393   MIB.addReg(t6);
9394
9395   MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
9396   for (int i=0; i <= lastAddrIndx; ++i)
9397     (*MIB).addOperand(*argOpers[i]);
9398
9399   assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
9400   (*MIB).setMemRefs(bInstr->memoperands_begin(),
9401                     bInstr->memoperands_end());
9402
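  // cmpxchg8b leaves the current memory contents in EDX:EAX (reloading them on
  // failure), so copy EAX/EDX into t3/t4; the PHIs at the top of newMBB use
  // these on the back edge, letting a retry see the freshly observed value
  // without another explicit load.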
9403   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
9404   MIB.addReg(X86::EAX);
9405   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
9406   MIB.addReg(X86::EDX);
9407
9408   // insert branch
9409   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
9410
9411   bInstr->eraseFromParent();   // The pseudo instruction is gone now.
9412   return nextMBB;
9413 }
9414
9415 // private utility function
9416 MachineBasicBlock *
9417 X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
9418                                                       MachineBasicBlock *MBB,
9419                                                       unsigned cmovOpc) const {
9420   // For the atomic min/max operator, we generate
9421   //   thisMBB:
9422   //   newMBB:
9423   //     ld t1 = [min/max.addr]
9424   //     mov t2 = [min/max.val]
9425   //     cmp  t1, t2
9426   //     cmov[cond] t2 = t1
9427   //     mov EAX = t1
9428   //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
9429   //     bnz newMBB   (repeat while the compare-exchange keeps failing)
9430   //     fallthrough --> nextMBB
9431   //
9432   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9433   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
9434   MachineFunction::iterator MBBIter = MBB;
9435   ++MBBIter;
9436
9437   // First build the CFG.
9438   MachineFunction *F = MBB->getParent();
9439   MachineBasicBlock *thisMBB = MBB;
9440   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
9441   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
9442   F->insert(MBBIter, newMBB);
9443   F->insert(MBBIter, nextMBB);
9444
9445   // Transfer the remainder of thisMBB and its successor edges to nextMBB.
9446   nextMBB->splice(nextMBB->begin(), thisMBB,
9447                   llvm::next(MachineBasicBlock::iterator(mInstr)),
9448                   thisMBB->end());
9449   nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
9450
9451   // Update thisMBB to fall through to newMBB.
9452   thisMBB->addSuccessor(newMBB);
9453
9454   // newMBB branches back to itself and falls through to nextMBB.
9455   newMBB->addSuccessor(nextMBB);
9456   newMBB->addSuccessor(newMBB);
9457
9458   DebugLoc dl = mInstr->getDebugLoc();
9459   // Insert instructions into newMBB based on incoming instruction.
9460   assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
9461          "unexpected number of operands");
9462   MachineOperand& destOper = mInstr->getOperand(0);
9463   MachineOperand* argOpers[2 + X86::AddrNumOperands];
9464   int numArgs = mInstr->getNumOperands() - 1;
9465   for (int i=0; i < numArgs; ++i)
9466     argOpers[i] = &mInstr->getOperand(i+1);
9467
9468   // An x86 address has 5 operands: base, scale, index, displacement, segment.
9469   int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
9470   int valArgIndx = lastAddrIndx + 1;
9471
9472   unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
9473   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
9474   for (int i=0; i <= lastAddrIndx; ++i)
9475     (*MIB).addOperand(*argOpers[i]);
9476
9477   // We only support register and immediate values.
9478   assert((argOpers[valArgIndx]->isReg() ||
9479           argOpers[valArgIndx]->isImm()) &&
9480          "invalid operand");
9481
9482   unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
9483   if (argOpers[valArgIndx]->isReg())
9484     MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
9485   else
9486     MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
9487   (*MIB).addOperand(*argOpers[valArgIndx]);
9488
9489   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
9490   MIB.addReg(t1);
9491
9492   MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
9493   MIB.addReg(t1);
9494   MIB.addReg(t2);
9495
9496   // Generate the cmov that selects the value to store (the min or max of t1, t2).
9497   unsigned t3 =
F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9498 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 9499 MIB.addReg(t2); 9500 MIB.addReg(t1); 9501 9502 // Cmp and exchange if none has modified the memory location 9503 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 9504 for (int i=0; i <= lastAddrIndx; ++i) 9505 (*MIB).addOperand(*argOpers[i]); 9506 MIB.addReg(t3); 9507 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9508 (*MIB).setMemRefs(mInstr->memoperands_begin(), 9509 mInstr->memoperands_end()); 9510 9511 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 9512 MIB.addReg(X86::EAX); 9513 9514 // insert branch 9515 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9516 9517 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 9518 return nextMBB; 9519} 9520 9521// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 9522// or XMM0_V32I8 in AVX all of this code can be replaced with that 9523// in the .td file. 9524MachineBasicBlock * 9525X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 9526 unsigned numArgs, bool memArg) const { 9527 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 9528 "Target must have SSE4.2 or AVX features enabled"); 9529 9530 DebugLoc dl = MI->getDebugLoc(); 9531 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9532 unsigned Opc; 9533 if (!Subtarget->hasAVX()) { 9534 if (memArg) 9535 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 9536 else 9537 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 9538 } else { 9539 if (memArg) 9540 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 9541 else 9542 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 9543 } 9544 9545 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 9546 for (unsigned i = 0; i < numArgs; ++i) { 9547 MachineOperand &Op = MI->getOperand(i+1); 9548 if (!(Op.isReg() && Op.isImplicit())) 9549 MIB.addOperand(Op); 9550 } 9551 BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 9552 .addReg(X86::XMM0); 9553 9554 MI->eraseFromParent(); 9555 return BB; 9556} 9557 9558MachineBasicBlock * 9559X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { 9560 DebugLoc dl = MI->getDebugLoc(); 9561 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9562 9563 // Address into RAX/EAX, other two args into ECX, EDX. 9564 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 9565 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 9566 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 9567 for (int i = 0; i < X86::AddrNumOperands; ++i) 9568 MIB.addOperand(MI->getOperand(i)); 9569 9570 unsigned ValOps = X86::AddrNumOperands; 9571 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 9572 .addReg(MI->getOperand(ValOps).getReg()); 9573 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 9574 .addReg(MI->getOperand(ValOps+1).getReg()); 9575 9576 // The instruction doesn't actually take any operands though. 9577 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 9578 9579 MI->eraseFromParent(); // The pseudo is gone now. 
9580 return BB; 9581} 9582 9583MachineBasicBlock * 9584X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { 9585 DebugLoc dl = MI->getDebugLoc(); 9586 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9587 9588 // First arg in ECX, the second in EAX. 9589 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 9590 .addReg(MI->getOperand(0).getReg()); 9591 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) 9592 .addReg(MI->getOperand(1).getReg()); 9593 9594 // The instruction doesn't actually take any operands though. 9595 BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); 9596 9597 MI->eraseFromParent(); // The pseudo is gone now. 9598 return BB; 9599} 9600 9601MachineBasicBlock * 9602X86TargetLowering::EmitVAARG64WithCustomInserter( 9603 MachineInstr *MI, 9604 MachineBasicBlock *MBB) const { 9605 // Emit va_arg instruction on X86-64. 9606 9607 // Operands to this pseudo-instruction: 9608 // 0 ) Output : destination address (reg) 9609 // 1-5) Input : va_list address (addr, i64mem) 9610 // 6 ) ArgSize : Size (in bytes) of vararg type 9611 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 9612 // 8 ) Align : Alignment of type 9613 // 9 ) EFLAGS (implicit-def) 9614 9615 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 9616 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 9617 9618 unsigned DestReg = MI->getOperand(0).getReg(); 9619 MachineOperand &Base = MI->getOperand(1); 9620 MachineOperand &Scale = MI->getOperand(2); 9621 MachineOperand &Index = MI->getOperand(3); 9622 MachineOperand &Disp = MI->getOperand(4); 9623 MachineOperand &Segment = MI->getOperand(5); 9624 unsigned ArgSize = MI->getOperand(6).getImm(); 9625 unsigned ArgMode = MI->getOperand(7).getImm(); 9626 unsigned Align = MI->getOperand(8).getImm(); 9627 9628 // Memory Reference 9629 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 9630 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 9631 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 9632 9633 // Machine Information 9634 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9635 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 9636 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 9637 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 9638 DebugLoc DL = MI->getDebugLoc(); 9639 9640 // struct va_list { 9641 // i32 gp_offset 9642 // i32 fp_offset 9643 // i64 overflow_area (address) 9644 // i64 reg_save_area (address) 9645 // } 9646 // sizeof(va_list) = 24 9647 // alignment(va_list) = 8 9648 9649 unsigned TotalNumIntRegs = 6; 9650 unsigned TotalNumXMMRegs = 8; 9651 bool UseGPOffset = (ArgMode == 1); 9652 bool UseFPOffset = (ArgMode == 2); 9653 unsigned MaxOffset = TotalNumIntRegs * 8 + 9654 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 9655 9656 /* Align ArgSize to a multiple of 8 */ 9657 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 9658 bool NeedsAlign = (Align > 8); 9659 9660 MachineBasicBlock *thisMBB = MBB; 9661 MachineBasicBlock *overflowMBB; 9662 MachineBasicBlock *offsetMBB; 9663 MachineBasicBlock *endMBB; 9664 9665 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 9666 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 9667 unsigned OffsetReg = 0; 9668 9669 if (!UseGPOffset && !UseFPOffset) { 9670 // If we only pull from the overflow region, we don't create a branch. 
9671 // We don't need to alter control flow. 9672 OffsetDestReg = 0; // unused 9673 OverflowDestReg = DestReg; 9674 9675 offsetMBB = NULL; 9676 overflowMBB = thisMBB; 9677 endMBB = thisMBB; 9678 } else { 9679 // First emit code to check if gp_offset (or fp_offset) is below the bound. 9680 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 9681 // If not, pull from overflow_area. (branch to overflowMBB) 9682 // 9683 // thisMBB 9684 // | . 9685 // | . 9686 // offsetMBB overflowMBB 9687 // | . 9688 // | . 9689 // endMBB 9690 9691 // Registers for the PHI in endMBB 9692 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 9693 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 9694 9695 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9696 MachineFunction *MF = MBB->getParent(); 9697 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9698 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9699 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9700 9701 MachineFunction::iterator MBBIter = MBB; 9702 ++MBBIter; 9703 9704 // Insert the new basic blocks 9705 MF->insert(MBBIter, offsetMBB); 9706 MF->insert(MBBIter, overflowMBB); 9707 MF->insert(MBBIter, endMBB); 9708 9709 // Transfer the remainder of MBB and its successor edges to endMBB. 9710 endMBB->splice(endMBB->begin(), thisMBB, 9711 llvm::next(MachineBasicBlock::iterator(MI)), 9712 thisMBB->end()); 9713 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9714 9715 // Make offsetMBB and overflowMBB successors of thisMBB 9716 thisMBB->addSuccessor(offsetMBB); 9717 thisMBB->addSuccessor(overflowMBB); 9718 9719 // endMBB is a successor of both offsetMBB and overflowMBB 9720 offsetMBB->addSuccessor(endMBB); 9721 overflowMBB->addSuccessor(endMBB); 9722 9723 // Load the offset value into a register 9724 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 9725 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 9726 .addOperand(Base) 9727 .addOperand(Scale) 9728 .addOperand(Index) 9729 .addDisp(Disp, UseFPOffset ? 4 : 0) 9730 .addOperand(Segment) 9731 .setMemRefs(MMOBegin, MMOEnd); 9732 9733 // Check if there is enough room left to pull this argument. 9734 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 9735 .addReg(OffsetReg) 9736 .addImm(MaxOffset + 8 - ArgSizeA8); 9737 9738 // Branch to "overflowMBB" if offset >= max 9739 // Fall through to "offsetMBB" otherwise 9740 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 9741 .addMBB(overflowMBB); 9742 } 9743 9744 // In offsetMBB, emit code to use the reg_save_area. 9745 if (offsetMBB) { 9746 assert(OffsetReg != 0); 9747 9748 // Read the reg_save_area address. 9749 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 9750 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 9751 .addOperand(Base) 9752 .addOperand(Scale) 9753 .addOperand(Index) 9754 .addDisp(Disp, 16) 9755 .addOperand(Segment) 9756 .setMemRefs(MMOBegin, MMOEnd); 9757 9758 // Zero-extend the offset 9759 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 9760 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 9761 .addImm(0) 9762 .addReg(OffsetReg) 9763 .addImm(X86::sub_32bit); 9764 9765 // Add the offset to the reg_save_area to get the final address. 
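    // (In the SysV AMD64 register save area the six integer registers occupy
    // bytes 0-47 and the eight XMM registers bytes 48-175, matching the
    // MaxOffset computation above.)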
9766 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 9767 .addReg(OffsetReg64) 9768 .addReg(RegSaveReg); 9769 9770 // Compute the offset for the next argument 9771 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 9772 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 9773 .addReg(OffsetReg) 9774 .addImm(UseFPOffset ? 16 : 8); 9775 9776 // Store it back into the va_list. 9777 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 9778 .addOperand(Base) 9779 .addOperand(Scale) 9780 .addOperand(Index) 9781 .addDisp(Disp, UseFPOffset ? 4 : 0) 9782 .addOperand(Segment) 9783 .addReg(NextOffsetReg) 9784 .setMemRefs(MMOBegin, MMOEnd); 9785 9786 // Jump to endMBB 9787 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 9788 .addMBB(endMBB); 9789 } 9790 9791 // 9792 // Emit code to use overflow area 9793 // 9794 9795 // Load the overflow_area address into a register. 9796 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 9797 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 9798 .addOperand(Base) 9799 .addOperand(Scale) 9800 .addOperand(Index) 9801 .addDisp(Disp, 8) 9802 .addOperand(Segment) 9803 .setMemRefs(MMOBegin, MMOEnd); 9804 9805 // If we need to align it, do so. Otherwise, just copy the address 9806 // to OverflowDestReg. 9807 if (NeedsAlign) { 9808 // Align the overflow address 9809 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 9810 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 9811 9812 // aligned_addr = (addr + (align-1)) & ~(align-1) 9813 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 9814 .addReg(OverflowAddrReg) 9815 .addImm(Align-1); 9816 9817 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 9818 .addReg(TmpReg) 9819 .addImm(~(uint64_t)(Align-1)); 9820 } else { 9821 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 9822 .addReg(OverflowAddrReg); 9823 } 9824 9825 // Compute the next overflow address after this argument. 9826 // (the overflow address should be kept 8-byte aligned) 9827 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 9828 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 9829 .addReg(OverflowDestReg) 9830 .addImm(ArgSizeA8); 9831 9832 // Store the new overflow address. 9833 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 9834 .addOperand(Base) 9835 .addOperand(Scale) 9836 .addOperand(Index) 9837 .addDisp(Disp, 8) 9838 .addOperand(Segment) 9839 .addReg(NextAddrReg) 9840 .setMemRefs(MMOBegin, MMOEnd); 9841 9842 // If we branched, emit the PHI to the front of endMBB. 9843 if (offsetMBB) { 9844 BuildMI(*endMBB, endMBB->begin(), DL, 9845 TII->get(X86::PHI), DestReg) 9846 .addReg(OffsetDestReg).addMBB(offsetMBB) 9847 .addReg(OverflowDestReg).addMBB(overflowMBB); 9848 } 9849 9850 // Erase the pseudo instruction 9851 MI->eraseFromParent(); 9852 9853 return endMBB; 9854} 9855 9856MachineBasicBlock * 9857X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 9858 MachineInstr *MI, 9859 MachineBasicBlock *MBB) const { 9860 // Emit code to save XMM registers to the stack. The ABI says that the 9861 // number of registers to save is given in %al, so it's theoretically 9862 // possible to do an indirect jump trick to avoid saving all of them, 9863 // however this code takes a simpler approach and just executes all 9864 // of the stores if %al is non-zero. It's less code, and it's probably 9865 // easier on the hardware branch predictor, and stores aren't all that 9866 // expensive anyway. 
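  //
  // Roughly, the emitted code looks like:
  //   MBB:        test %al, %al
  //               je   EndMBB                  (this guard is omitted on Win64)
  //   XMMSaveMBB: movaps %xmm<i>, <RegSaveFrameIndex + VarArgsFPOffset + 16*i>
  //   EndMBB: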
9867 9868 // Create the new basic blocks. One block contains all the XMM stores, 9869 // and one block is the final destination regardless of whether any 9870 // stores were performed. 9871 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9872 MachineFunction *F = MBB->getParent(); 9873 MachineFunction::iterator MBBIter = MBB; 9874 ++MBBIter; 9875 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 9876 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 9877 F->insert(MBBIter, XMMSaveMBB); 9878 F->insert(MBBIter, EndMBB); 9879 9880 // Transfer the remainder of MBB and its successor edges to EndMBB. 9881 EndMBB->splice(EndMBB->begin(), MBB, 9882 llvm::next(MachineBasicBlock::iterator(MI)), 9883 MBB->end()); 9884 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 9885 9886 // The original block will now fall through to the XMM save block. 9887 MBB->addSuccessor(XMMSaveMBB); 9888 // The XMMSaveMBB will fall through to the end block. 9889 XMMSaveMBB->addSuccessor(EndMBB); 9890 9891 // Now add the instructions. 9892 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9893 DebugLoc DL = MI->getDebugLoc(); 9894 9895 unsigned CountReg = MI->getOperand(0).getReg(); 9896 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 9897 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 9898 9899 if (!Subtarget->isTargetWin64()) { 9900 // If %al is 0, branch around the XMM save block. 9901 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 9902 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 9903 MBB->addSuccessor(EndMBB); 9904 } 9905 9906 // In the XMM save block, save all the XMM argument registers. 9907 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 9908 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 9909 MachineMemOperand *MMO = 9910 F->getMachineMemOperand( 9911 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 9912 MachineMemOperand::MOStore, 9913 /*Size=*/16, /*Align=*/16); 9914 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 9915 .addFrameIndex(RegSaveFrameIndex) 9916 .addImm(/*Scale=*/1) 9917 .addReg(/*IndexReg=*/0) 9918 .addImm(/*Disp=*/Offset) 9919 .addReg(/*Segment=*/0) 9920 .addReg(MI->getOperand(i).getReg()) 9921 .addMemOperand(MMO); 9922 } 9923 9924 MI->eraseFromParent(); // The pseudo instruction is gone now. 9925 9926 return EndMBB; 9927} 9928 9929MachineBasicBlock * 9930X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 9931 MachineBasicBlock *BB) const { 9932 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9933 DebugLoc DL = MI->getDebugLoc(); 9934 9935 // To "insert" a SELECT_CC instruction, we actually have to insert the 9936 // diamond control-flow pattern. The incoming instruction knows the 9937 // destination vreg to set, the condition code register to branch on, the 9938 // true/false values to select between, and a branch opcode to use. 9939 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9940 MachineFunction::iterator It = BB; 9941 ++It; 9942 9943 // thisMBB: 9944 // ... 9945 // TrueVal = ... 
9946 // cmpTY ccX, r1, r2 9947 // bCC copy1MBB 9948 // fallthrough --> copy0MBB 9949 MachineBasicBlock *thisMBB = BB; 9950 MachineFunction *F = BB->getParent(); 9951 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 9952 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 9953 F->insert(It, copy0MBB); 9954 F->insert(It, sinkMBB); 9955 9956 // If the EFLAGS register isn't dead in the terminator, then claim that it's 9957 // live into the sink and copy blocks. 9958 const MachineFunction *MF = BB->getParent(); 9959 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 9960 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 9961 9962 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 9963 const MachineOperand &MO = MI->getOperand(I); 9964 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 9965 unsigned Reg = MO.getReg(); 9966 if (Reg != X86::EFLAGS) continue; 9967 copy0MBB->addLiveIn(Reg); 9968 sinkMBB->addLiveIn(Reg); 9969 } 9970 9971 // Transfer the remainder of BB and its successor edges to sinkMBB. 9972 sinkMBB->splice(sinkMBB->begin(), BB, 9973 llvm::next(MachineBasicBlock::iterator(MI)), 9974 BB->end()); 9975 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 9976 9977 // Add the true and fallthrough blocks as its successors. 9978 BB->addSuccessor(copy0MBB); 9979 BB->addSuccessor(sinkMBB); 9980 9981 // Create the conditional branch instruction. 9982 unsigned Opc = 9983 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 9984 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 9985 9986 // copy0MBB: 9987 // %FalseValue = ... 9988 // # fallthrough to sinkMBB 9989 copy0MBB->addSuccessor(sinkMBB); 9990 9991 // sinkMBB: 9992 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 9993 // ... 9994 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 9995 TII->get(X86::PHI), MI->getOperand(0).getReg()) 9996 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 9997 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 9998 9999 MI->eraseFromParent(); // The pseudo instruction is gone now. 10000 return sinkMBB; 10001} 10002 10003MachineBasicBlock * 10004X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 10005 MachineBasicBlock *BB) const { 10006 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10007 DebugLoc DL = MI->getDebugLoc(); 10008 10009 // The lowering is pretty easy: we're just emitting the call to _alloca. The 10010 // non-trivial part is impdef of ESP. 10011 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 10012 // mingw-w64. 10013 10014 const char *StackProbeSymbol = 10015 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 10016 10017 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 10018 .addExternalSymbol(StackProbeSymbol) 10019 .addReg(X86::EAX, RegState::Implicit) 10020 .addReg(X86::ESP, RegState::Implicit) 10021 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 10022 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 10023 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 10024 10025 MI->eraseFromParent(); // The pseudo instruction is gone now. 10026 return BB; 10027} 10028 10029MachineBasicBlock * 10030X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 10031 MachineBasicBlock *BB) const { 10032 // This is pretty easy. We're taking the value that we received from 10033 // our load from the relocation, sticking it in either RDI (x86-64) 10034 // or EAX and doing an indirect call. 
The return value will then 10035 // be in the normal return register. 10036 const X86InstrInfo *TII 10037 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 10038 DebugLoc DL = MI->getDebugLoc(); 10039 MachineFunction *F = BB->getParent(); 10040 10041 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 10042 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 10043 10044 if (Subtarget->is64Bit()) { 10045 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10046 TII->get(X86::MOV64rm), X86::RDI) 10047 .addReg(X86::RIP) 10048 .addImm(0).addReg(0) 10049 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10050 MI->getOperand(3).getTargetFlags()) 10051 .addReg(0); 10052 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 10053 addDirectMem(MIB, X86::RDI); 10054 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 10055 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10056 TII->get(X86::MOV32rm), X86::EAX) 10057 .addReg(0) 10058 .addImm(0).addReg(0) 10059 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10060 MI->getOperand(3).getTargetFlags()) 10061 .addReg(0); 10062 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 10063 addDirectMem(MIB, X86::EAX); 10064 } else { 10065 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10066 TII->get(X86::MOV32rm), X86::EAX) 10067 .addReg(TII->getGlobalBaseReg(F)) 10068 .addImm(0).addReg(0) 10069 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10070 MI->getOperand(3).getTargetFlags()) 10071 .addReg(0); 10072 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 10073 addDirectMem(MIB, X86::EAX); 10074 } 10075 10076 MI->eraseFromParent(); // The pseudo instruction is gone now. 10077 return BB; 10078} 10079 10080MachineBasicBlock * 10081X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 10082 MachineBasicBlock *BB) const { 10083 switch (MI->getOpcode()) { 10084 default: assert(false && "Unexpected instr type to insert"); 10085 case X86::TAILJMPd64: 10086 case X86::TAILJMPr64: 10087 case X86::TAILJMPm64: 10088 assert(!"TAILJMP64 would not be touched here."); 10089 case X86::TCRETURNdi64: 10090 case X86::TCRETURNri64: 10091 case X86::TCRETURNmi64: 10092 // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset. 10093 // On AMD64, additional defs should be added before register allocation. 
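    // RSI, RDI and XMM6-XMM15 are callee-saved under the Win64 convention but
    // volatile in the SysV AMD64 ABI, so when not targeting Win64 the tail
    // call must additionally be marked as clobbering them.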
10094 if (!Subtarget->isTargetWin64()) { 10095 MI->addRegisterDefined(X86::RSI); 10096 MI->addRegisterDefined(X86::RDI); 10097 MI->addRegisterDefined(X86::XMM6); 10098 MI->addRegisterDefined(X86::XMM7); 10099 MI->addRegisterDefined(X86::XMM8); 10100 MI->addRegisterDefined(X86::XMM9); 10101 MI->addRegisterDefined(X86::XMM10); 10102 MI->addRegisterDefined(X86::XMM11); 10103 MI->addRegisterDefined(X86::XMM12); 10104 MI->addRegisterDefined(X86::XMM13); 10105 MI->addRegisterDefined(X86::XMM14); 10106 MI->addRegisterDefined(X86::XMM15); 10107 } 10108 return BB; 10109 case X86::WIN_ALLOCA: 10110 return EmitLoweredWinAlloca(MI, BB); 10111 case X86::TLSCall_32: 10112 case X86::TLSCall_64: 10113 return EmitLoweredTLSCall(MI, BB); 10114 case X86::CMOV_GR8: 10115 case X86::CMOV_FR32: 10116 case X86::CMOV_FR64: 10117 case X86::CMOV_V4F32: 10118 case X86::CMOV_V2F64: 10119 case X86::CMOV_V2I64: 10120 case X86::CMOV_GR16: 10121 case X86::CMOV_GR32: 10122 case X86::CMOV_RFP32: 10123 case X86::CMOV_RFP64: 10124 case X86::CMOV_RFP80: 10125 return EmitLoweredSelect(MI, BB); 10126 10127 case X86::FP32_TO_INT16_IN_MEM: 10128 case X86::FP32_TO_INT32_IN_MEM: 10129 case X86::FP32_TO_INT64_IN_MEM: 10130 case X86::FP64_TO_INT16_IN_MEM: 10131 case X86::FP64_TO_INT32_IN_MEM: 10132 case X86::FP64_TO_INT64_IN_MEM: 10133 case X86::FP80_TO_INT16_IN_MEM: 10134 case X86::FP80_TO_INT32_IN_MEM: 10135 case X86::FP80_TO_INT64_IN_MEM: { 10136 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10137 DebugLoc DL = MI->getDebugLoc(); 10138 10139 // Change the floating point control register to use "round towards zero" 10140 // mode when truncating to an integer value. 10141 MachineFunction *F = BB->getParent(); 10142 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 10143 addFrameReference(BuildMI(*BB, MI, DL, 10144 TII->get(X86::FNSTCW16m)), CWFrameIdx); 10145 10146 // Load the old value of the high byte of the control word... 10147 unsigned OldCW = 10148 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 10149 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 10150 CWFrameIdx); 10151 10152 // Set the high part to be round to zero... 10153 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 10154 .addImm(0xC7F); 10155 10156 // Reload the modified control word now... 10157 addFrameReference(BuildMI(*BB, MI, DL, 10158 TII->get(X86::FLDCW16m)), CWFrameIdx); 10159 10160 // Restore the memory image of control word to original value 10161 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 10162 .addReg(OldCW); 10163 10164 // Get the X86 opcode to use. 
10165 unsigned Opc; 10166 switch (MI->getOpcode()) { 10167 default: llvm_unreachable("illegal opcode!"); 10168 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 10169 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 10170 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 10171 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 10172 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 10173 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 10174 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 10175 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 10176 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 10177 } 10178 10179 X86AddressMode AM; 10180 MachineOperand &Op = MI->getOperand(0); 10181 if (Op.isReg()) { 10182 AM.BaseType = X86AddressMode::RegBase; 10183 AM.Base.Reg = Op.getReg(); 10184 } else { 10185 AM.BaseType = X86AddressMode::FrameIndexBase; 10186 AM.Base.FrameIndex = Op.getIndex(); 10187 } 10188 Op = MI->getOperand(1); 10189 if (Op.isImm()) 10190 AM.Scale = Op.getImm(); 10191 Op = MI->getOperand(2); 10192 if (Op.isImm()) 10193 AM.IndexReg = Op.getImm(); 10194 Op = MI->getOperand(3); 10195 if (Op.isGlobal()) { 10196 AM.GV = Op.getGlobal(); 10197 } else { 10198 AM.Disp = Op.getImm(); 10199 } 10200 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 10201 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 10202 10203 // Reload the original control word now. 10204 addFrameReference(BuildMI(*BB, MI, DL, 10205 TII->get(X86::FLDCW16m)), CWFrameIdx); 10206 10207 MI->eraseFromParent(); // The pseudo instruction is gone now. 10208 return BB; 10209 } 10210 // String/text processing lowering. 10211 case X86::PCMPISTRM128REG: 10212 case X86::VPCMPISTRM128REG: 10213 return EmitPCMP(MI, BB, 3, false /* in-mem */); 10214 case X86::PCMPISTRM128MEM: 10215 case X86::VPCMPISTRM128MEM: 10216 return EmitPCMP(MI, BB, 3, true /* in-mem */); 10217 case X86::PCMPESTRM128REG: 10218 case X86::VPCMPESTRM128REG: 10219 return EmitPCMP(MI, BB, 5, false /* in mem */); 10220 case X86::PCMPESTRM128MEM: 10221 case X86::VPCMPESTRM128MEM: 10222 return EmitPCMP(MI, BB, 5, true /* in mem */); 10223 10224 // Thread synchronization. 10225 case X86::MONITOR: 10226 return EmitMonitor(MI, BB); 10227 case X86::MWAIT: 10228 return EmitMwait(MI, BB); 10229 10230 // Atomic Lowering. 
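  // Each ATOM* pseudo below is expanded into a compare-and-swap retry loop:
  // load the old value, compute the new value with the given opcodes (the
  // loaded value is inverted first for the NAND forms), then LOCK CMPXCHG it
  // back and loop if the memory location changed in the meantime.  The
  // MIN/MAX forms compute the new value with a CMP + CMOV instead.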
10231 case X86::ATOMAND32: 10232 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10233 X86::AND32ri, X86::MOV32rm, 10234 X86::LCMPXCHG32, 10235 X86::NOT32r, X86::EAX, 10236 X86::GR32RegisterClass); 10237 case X86::ATOMOR32: 10238 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 10239 X86::OR32ri, X86::MOV32rm, 10240 X86::LCMPXCHG32, 10241 X86::NOT32r, X86::EAX, 10242 X86::GR32RegisterClass); 10243 case X86::ATOMXOR32: 10244 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 10245 X86::XOR32ri, X86::MOV32rm, 10246 X86::LCMPXCHG32, 10247 X86::NOT32r, X86::EAX, 10248 X86::GR32RegisterClass); 10249 case X86::ATOMNAND32: 10250 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10251 X86::AND32ri, X86::MOV32rm, 10252 X86::LCMPXCHG32, 10253 X86::NOT32r, X86::EAX, 10254 X86::GR32RegisterClass, true); 10255 case X86::ATOMMIN32: 10256 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 10257 case X86::ATOMMAX32: 10258 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 10259 case X86::ATOMUMIN32: 10260 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 10261 case X86::ATOMUMAX32: 10262 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 10263 10264 case X86::ATOMAND16: 10265 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10266 X86::AND16ri, X86::MOV16rm, 10267 X86::LCMPXCHG16, 10268 X86::NOT16r, X86::AX, 10269 X86::GR16RegisterClass); 10270 case X86::ATOMOR16: 10271 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 10272 X86::OR16ri, X86::MOV16rm, 10273 X86::LCMPXCHG16, 10274 X86::NOT16r, X86::AX, 10275 X86::GR16RegisterClass); 10276 case X86::ATOMXOR16: 10277 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 10278 X86::XOR16ri, X86::MOV16rm, 10279 X86::LCMPXCHG16, 10280 X86::NOT16r, X86::AX, 10281 X86::GR16RegisterClass); 10282 case X86::ATOMNAND16: 10283 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10284 X86::AND16ri, X86::MOV16rm, 10285 X86::LCMPXCHG16, 10286 X86::NOT16r, X86::AX, 10287 X86::GR16RegisterClass, true); 10288 case X86::ATOMMIN16: 10289 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 10290 case X86::ATOMMAX16: 10291 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 10292 case X86::ATOMUMIN16: 10293 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 10294 case X86::ATOMUMAX16: 10295 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 10296 10297 case X86::ATOMAND8: 10298 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10299 X86::AND8ri, X86::MOV8rm, 10300 X86::LCMPXCHG8, 10301 X86::NOT8r, X86::AL, 10302 X86::GR8RegisterClass); 10303 case X86::ATOMOR8: 10304 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 10305 X86::OR8ri, X86::MOV8rm, 10306 X86::LCMPXCHG8, 10307 X86::NOT8r, X86::AL, 10308 X86::GR8RegisterClass); 10309 case X86::ATOMXOR8: 10310 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 10311 X86::XOR8ri, X86::MOV8rm, 10312 X86::LCMPXCHG8, 10313 X86::NOT8r, X86::AL, 10314 X86::GR8RegisterClass); 10315 case X86::ATOMNAND8: 10316 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10317 X86::AND8ri, X86::MOV8rm, 10318 X86::LCMPXCHG8, 10319 X86::NOT8r, X86::AL, 10320 X86::GR8RegisterClass, true); 10321 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 10322 // This group is for 64-bit host. 
10323 case X86::ATOMAND64: 10324 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10325 X86::AND64ri32, X86::MOV64rm, 10326 X86::LCMPXCHG64, 10327 X86::NOT64r, X86::RAX, 10328 X86::GR64RegisterClass); 10329 case X86::ATOMOR64: 10330 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 10331 X86::OR64ri32, X86::MOV64rm, 10332 X86::LCMPXCHG64, 10333 X86::NOT64r, X86::RAX, 10334 X86::GR64RegisterClass); 10335 case X86::ATOMXOR64: 10336 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 10337 X86::XOR64ri32, X86::MOV64rm, 10338 X86::LCMPXCHG64, 10339 X86::NOT64r, X86::RAX, 10340 X86::GR64RegisterClass); 10341 case X86::ATOMNAND64: 10342 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10343 X86::AND64ri32, X86::MOV64rm, 10344 X86::LCMPXCHG64, 10345 X86::NOT64r, X86::RAX, 10346 X86::GR64RegisterClass, true); 10347 case X86::ATOMMIN64: 10348 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 10349 case X86::ATOMMAX64: 10350 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 10351 case X86::ATOMUMIN64: 10352 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 10353 case X86::ATOMUMAX64: 10354 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 10355 10356 // This group does 64-bit operations on a 32-bit host. 10357 case X86::ATOMAND6432: 10358 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10359 X86::AND32rr, X86::AND32rr, 10360 X86::AND32ri, X86::AND32ri, 10361 false); 10362 case X86::ATOMOR6432: 10363 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10364 X86::OR32rr, X86::OR32rr, 10365 X86::OR32ri, X86::OR32ri, 10366 false); 10367 case X86::ATOMXOR6432: 10368 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10369 X86::XOR32rr, X86::XOR32rr, 10370 X86::XOR32ri, X86::XOR32ri, 10371 false); 10372 case X86::ATOMNAND6432: 10373 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10374 X86::AND32rr, X86::AND32rr, 10375 X86::AND32ri, X86::AND32ri, 10376 true); 10377 case X86::ATOMADD6432: 10378 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10379 X86::ADD32rr, X86::ADC32rr, 10380 X86::ADD32ri, X86::ADC32ri, 10381 false); 10382 case X86::ATOMSUB6432: 10383 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10384 X86::SUB32rr, X86::SBB32rr, 10385 X86::SUB32ri, X86::SBB32ri, 10386 false); 10387 case X86::ATOMSWAP6432: 10388 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10389 X86::MOV32rr, X86::MOV32rr, 10390 X86::MOV32ri, X86::MOV32ri, 10391 false); 10392 case X86::VASTART_SAVE_XMM_REGS: 10393 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 10394 10395 case X86::VAARG_64: 10396 return EmitVAARG64WithCustomInserter(MI, BB); 10397 } 10398} 10399 10400//===----------------------------------------------------------------------===// 10401// X86 Optimization Hooks 10402//===----------------------------------------------------------------------===// 10403 10404void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 10405 const APInt &Mask, 10406 APInt &KnownZero, 10407 APInt &KnownOne, 10408 const SelectionDAG &DAG, 10409 unsigned Depth) const { 10410 unsigned Opc = Op.getOpcode(); 10411 assert((Opc >= ISD::BUILTIN_OP_END || 10412 Opc == ISD::INTRINSIC_WO_CHAIN || 10413 Opc == ISD::INTRINSIC_W_CHAIN || 10414 Opc == ISD::INTRINSIC_VOID) && 10415 "Should use MaskedValueIsZero if you don't know whether Op" 10416 " is a target node!"); 10417 10418 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
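  // The X86 nodes handled below all produce a boolean result (SETCC itself,
  // and the second result of the flag-producing arithmetic nodes); for that
  // result every bit above bit 0 is known to be zero.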
10419 switch (Opc) { 10420 default: break; 10421 case X86ISD::ADD: 10422 case X86ISD::SUB: 10423 case X86ISD::ADC: 10424 case X86ISD::SBB: 10425 case X86ISD::SMUL: 10426 case X86ISD::UMUL: 10427 case X86ISD::INC: 10428 case X86ISD::DEC: 10429 case X86ISD::OR: 10430 case X86ISD::XOR: 10431 case X86ISD::AND: 10432 // These nodes' second result is a boolean. 10433 if (Op.getResNo() == 0) 10434 break; 10435 // Fallthrough 10436 case X86ISD::SETCC: 10437 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 10438 Mask.getBitWidth() - 1); 10439 break; 10440 } 10441} 10442 10443unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 10444 unsigned Depth) const { 10445 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 10446 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 10447 return Op.getValueType().getScalarType().getSizeInBits(); 10448 10449 // Fallback case. 10450 return 1; 10451} 10452 10453/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 10454/// node is a GlobalAddress + offset. 10455bool X86TargetLowering::isGAPlusOffset(SDNode *N, 10456 const GlobalValue* &GA, 10457 int64_t &Offset) const { 10458 if (N->getOpcode() == X86ISD::Wrapper) { 10459 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 10460 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 10461 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 10462 return true; 10463 } 10464 } 10465 return TargetLowering::isGAPlusOffset(N, GA, Offset); 10466} 10467 10468/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 10469/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 10470/// if the load addresses are consecutive, non-overlapping, and in the right 10471/// order. 10472static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 10473 TargetLowering::DAGCombinerInfo &DCI) { 10474 DebugLoc dl = N->getDebugLoc(); 10475 EVT VT = N->getValueType(0); 10476 10477 if (VT.getSizeInBits() != 128) 10478 return SDValue(); 10479 10480 // Don't create instructions with illegal types after legalize types has run. 10481 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10482 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 10483 return SDValue(); 10484 10485 SmallVector<SDValue, 16> Elts; 10486 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 10487 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 10488 10489 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 10490} 10491 10492/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 10493/// generation and convert it from being a bunch of shuffles and extracts 10494/// to a simple store and scalar loads to extract the elements. 10495static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 10496 const TargetLowering &TLI) { 10497 SDValue InputVector = N->getOperand(0); 10498 10499 // Only operate on vectors of 4 elements, where the alternative shuffling 10500 // gets to be more expensive. 10501 if (InputVector.getValueType() != MVT::v4i32) 10502 return SDValue(); 10503 10504 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 10505 // single use which is a sign-extend or zero-extend, and all elements are 10506 // used. 
10507 SmallVector<SDNode *, 4> Uses; 10508 unsigned ExtractedElements = 0; 10509 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 10510 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 10511 if (UI.getUse().getResNo() != InputVector.getResNo()) 10512 return SDValue(); 10513 10514 SDNode *Extract = *UI; 10515 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 10516 return SDValue(); 10517 10518 if (Extract->getValueType(0) != MVT::i32) 10519 return SDValue(); 10520 if (!Extract->hasOneUse()) 10521 return SDValue(); 10522 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 10523 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 10524 return SDValue(); 10525 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 10526 return SDValue(); 10527 10528 // Record which element was extracted. 10529 ExtractedElements |= 10530 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 10531 10532 Uses.push_back(Extract); 10533 } 10534 10535 // If not all the elements were used, this may not be worthwhile. 10536 if (ExtractedElements != 15) 10537 return SDValue(); 10538 10539 // Ok, we've now decided to do the transformation. 10540 DebugLoc dl = InputVector.getDebugLoc(); 10541 10542 // Store the value to a temporary stack slot. 10543 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 10544 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 10545 MachinePointerInfo(), false, false, 0); 10546 10547 // Replace each use (extract) with a load of the appropriate element. 10548 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 10549 UE = Uses.end(); UI != UE; ++UI) { 10550 SDNode *Extract = *UI; 10551 10552 // Compute the element's address. 10553 SDValue Idx = Extract->getOperand(1); 10554 unsigned EltSize = 10555 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 10556 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 10557 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 10558 10559 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), 10560 StackPtr, OffsetVal); 10561 10562 // Load the scalar. 10563 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 10564 ScalarAddr, MachinePointerInfo(), 10565 false, false, 0); 10566 10567 // Replace the exact with the load. 10568 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 10569 } 10570 10571 // The replacement was made in place; don't return anything. 10572 return SDValue(); 10573} 10574 10575/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 10576static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 10577 const X86Subtarget *Subtarget) { 10578 DebugLoc DL = N->getDebugLoc(); 10579 SDValue Cond = N->getOperand(0); 10580 // Get the LHS/RHS of the select. 10581 SDValue LHS = N->getOperand(1); 10582 SDValue RHS = N->getOperand(2); 10583 10584 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 10585 // instructions match the semantics of the common C idiom x<y?x:y but not 10586 // x<=y?x:y, because of how they handle negative zero (which can be 10587 // ignored in unsafe-math mode). 10588 if (Subtarget->hasSSE2() && 10589 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 10590 Cond.getOpcode() == ISD::SETCC) { 10591 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 10592 10593 unsigned Opcode = 0; 10594 // Check for x CC y ? x : y. 
10595     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
10596         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
10597       switch (CC) {
10598       default: break;
10599       case ISD::SETULT:
10600         // Converting this to a min would handle NaNs incorrectly, and swapping
10601         // the operands would cause it to handle comparisons between positive
10602         // and negative zero incorrectly.
10603         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
10604           if (!UnsafeFPMath &&
10605               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
10606             break;
10607           std::swap(LHS, RHS);
10608         }
10609         Opcode = X86ISD::FMIN;
10610         break;
10611       case ISD::SETOLE:
10612         // Converting this to a min would handle comparisons between positive
10613         // and negative zero incorrectly.
10614         if (!UnsafeFPMath &&
10615             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
10616           break;
10617         Opcode = X86ISD::FMIN;
10618         break;
10619       case ISD::SETULE:
10620         // Converting this to a min would handle both negative zeros and NaNs
10621         // incorrectly, but we can swap the operands to fix both.
10622         std::swap(LHS, RHS);
10623       case ISD::SETOLT:
10624       case ISD::SETLT:
10625       case ISD::SETLE:
10626         Opcode = X86ISD::FMIN;
10627         break;
10628
10629       case ISD::SETOGE:
10630         // Converting this to a max would handle comparisons between positive
10631         // and negative zero incorrectly.
10632         if (!UnsafeFPMath &&
10633             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
10634           break;
10635         Opcode = X86ISD::FMAX;
10636         break;
10637       case ISD::SETUGT:
10638         // Converting this to a max would handle NaNs incorrectly, and swapping
10639         // the operands would cause it to handle comparisons between positive
10640         // and negative zero incorrectly.
10641         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
10642           if (!UnsafeFPMath &&
10643               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
10644             break;
10645           std::swap(LHS, RHS);
10646         }
10647         Opcode = X86ISD::FMAX;
10648         break;
10649       case ISD::SETUGE:
10650         // Converting this to a max would handle both negative zeros and NaNs
10651         // incorrectly, but we can swap the operands to fix both.
10652         std::swap(LHS, RHS);
10653       case ISD::SETOGT:
10654       case ISD::SETGT:
10655       case ISD::SETGE:
10656         Opcode = X86ISD::FMAX;
10657         break;
10658       }
10659     // Check for x CC y ? y : x -- a min/max with reversed arms.
10660     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
10661                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
10662       switch (CC) {
10663       default: break;
10664       case ISD::SETOGE:
10665         // Converting this to a min would handle comparisons between positive
10666         // and negative zero incorrectly, and swapping the operands would
10667         // cause it to handle NaNs incorrectly.
10668         if (!UnsafeFPMath &&
10669             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
10670           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
10671             break;
10672           std::swap(LHS, RHS);
10673         }
10674         Opcode = X86ISD::FMIN;
10675         break;
10676       case ISD::SETUGT:
10677         // Converting this to a min would handle NaNs incorrectly.
10678         if (!UnsafeFPMath &&
10679             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
10680           break;
10681         Opcode = X86ISD::FMIN;
10682         break;
10683       case ISD::SETUGE:
10684         // Converting this to a min would handle both negative zeros and NaNs
10685         // incorrectly, but we can swap the operands to fix both.
10686 std::swap(LHS, RHS); 10687 case ISD::SETOGT: 10688 case ISD::SETGT: 10689 case ISD::SETGE: 10690 Opcode = X86ISD::FMIN; 10691 break; 10692 10693 case ISD::SETULT: 10694 // Converting this to a max would handle NaNs incorrectly. 10695 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10696 break; 10697 Opcode = X86ISD::FMAX; 10698 break; 10699 case ISD::SETOLE: 10700 // Converting this to a max would handle comparisons between positive 10701 // and negative zero incorrectly, and swapping the operands would 10702 // cause it to handle NaNs incorrectly. 10703 if (!UnsafeFPMath && 10704 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 10705 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10706 break; 10707 std::swap(LHS, RHS); 10708 } 10709 Opcode = X86ISD::FMAX; 10710 break; 10711 case ISD::SETULE: 10712 // Converting this to a max would handle both negative zeros and NaNs 10713 // incorrectly, but we can swap the operands to fix both. 10714 std::swap(LHS, RHS); 10715 case ISD::SETOLT: 10716 case ISD::SETLT: 10717 case ISD::SETLE: 10718 Opcode = X86ISD::FMAX; 10719 break; 10720 } 10721 } 10722 10723 if (Opcode) 10724 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 10725 } 10726 10727 // If this is a select between two integer constants, try to do some 10728 // optimizations. 10729 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 10730 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 10731 // Don't do this for crazy integer types. 10732 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 10733 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 10734 // so that TrueC (the true value) is larger than FalseC. 10735 bool NeedsCondInvert = false; 10736 10737 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 10738 // Efficiently invertible. 10739 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 10740 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 10741 isa<ConstantSDNode>(Cond.getOperand(1))))) { 10742 NeedsCondInvert = true; 10743 std::swap(TrueC, FalseC); 10744 } 10745 10746 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 10747 if (FalseC->getAPIntValue() == 0 && 10748 TrueC->getAPIntValue().isPowerOf2()) { 10749 if (NeedsCondInvert) // Invert the condition if needed. 10750 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10751 DAG.getConstant(1, Cond.getValueType())); 10752 10753 // Zero extend the condition if needed. 10754 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 10755 10756 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 10757 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 10758 DAG.getConstant(ShAmt, MVT::i8)); 10759 } 10760 10761 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 10762 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 10763 if (NeedsCondInvert) // Invert the condition if needed. 10764 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10765 DAG.getConstant(1, Cond.getValueType())); 10766 10767 // Zero extend the condition if needed. 10768 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 10769 FalseC->getValueType(0), Cond); 10770 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10771 SDValue(FalseC, 0)); 10772 } 10773 10774 // Optimize cases that will turn into an LEA instruction. This requires 10775 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
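        // Rough example of the intent: for (select C, 8, 3) the difference is
        // 5, so the result can be formed as zext(C)*5 + 3, i.e. roughly
        //   leal 3(%cond,%cond,4), %result
        // which is why only differences with cheap LEA/ADD multipliers
        // (1, 2, 3, 4, 5, 8, 9) are accepted below.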
10776 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 10777 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 10778 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 10779 10780 bool isFastMultiplier = false; 10781 if (Diff < 10) { 10782 switch ((unsigned char)Diff) { 10783 default: break; 10784 case 1: // result = add base, cond 10785 case 2: // result = lea base( , cond*2) 10786 case 3: // result = lea base(cond, cond*2) 10787 case 4: // result = lea base( , cond*4) 10788 case 5: // result = lea base(cond, cond*4) 10789 case 8: // result = lea base( , cond*8) 10790 case 9: // result = lea base(cond, cond*8) 10791 isFastMultiplier = true; 10792 break; 10793 } 10794 } 10795 10796 if (isFastMultiplier) { 10797 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 10798 if (NeedsCondInvert) // Invert the condition if needed. 10799 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10800 DAG.getConstant(1, Cond.getValueType())); 10801 10802 // Zero extend the condition if needed. 10803 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 10804 Cond); 10805 // Scale the condition by the difference. 10806 if (Diff != 1) 10807 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 10808 DAG.getConstant(Diff, Cond.getValueType())); 10809 10810 // Add the base if non-zero. 10811 if (FalseC->getAPIntValue() != 0) 10812 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10813 SDValue(FalseC, 0)); 10814 return Cond; 10815 } 10816 } 10817 } 10818 } 10819 10820 return SDValue(); 10821} 10822 10823/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 10824static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 10825 TargetLowering::DAGCombinerInfo &DCI) { 10826 DebugLoc DL = N->getDebugLoc(); 10827 10828 // If the flag operand isn't dead, don't touch this CMOV. 10829 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 10830 return SDValue(); 10831 10832 // If this is a select between two integer constants, try to do some 10833 // optimizations. Note that the operands are ordered the opposite of SELECT 10834 // operands. 10835 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 10836 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 10837 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 10838 // larger than FalseC (the false value). 10839 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 10840 10841 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 10842 CC = X86::GetOppositeBranchCondition(CC); 10843 std::swap(TrueC, FalseC); 10844 } 10845 10846 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 10847 // This is efficient for any integer data type (including i8/i16) and 10848 // shift amount. 10849 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 10850 SDValue Cond = N->getOperand(3); 10851 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10852 DAG.getConstant(CC, MVT::i8), Cond); 10853 10854 // Zero extend the condition if needed. 10855 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 10856 10857 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 10858 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 10859 DAG.getConstant(ShAmt, MVT::i8)); 10860 if (N->getNumValues() == 2) // Dead flag value? 10861 return DCI.CombineTo(N, Cond, SDValue()); 10862 return Cond; 10863 } 10864 10865 // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C)+cst. This is efficient 10866 // for any integer data type, including i8/i16. 10867 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 10868 SDValue Cond = N->getOperand(3); 10869 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10870 DAG.getConstant(CC, MVT::i8), Cond); 10871 10872 // Zero extend the condition if needed. 10873 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 10874 FalseC->getValueType(0), Cond); 10875 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10876 SDValue(FalseC, 0)); 10877 10878 if (N->getNumValues() == 2) // Dead flag value? 10879 return DCI.CombineTo(N, Cond, SDValue()); 10880 return Cond; 10881 } 10882 10883 // Optimize cases that will turn into an LEA instruction. This requires 10884 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 10885 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 10886 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 10887 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 10888 10889 bool isFastMultiplier = false; 10890 if (Diff < 10) { 10891 switch ((unsigned char)Diff) { 10892 default: break; 10893 case 1: // result = add base, cond 10894 case 2: // result = lea base( , cond*2) 10895 case 3: // result = lea base(cond, cond*2) 10896 case 4: // result = lea base( , cond*4) 10897 case 5: // result = lea base(cond, cond*4) 10898 case 8: // result = lea base( , cond*8) 10899 case 9: // result = lea base(cond, cond*8) 10900 isFastMultiplier = true; 10901 break; 10902 } 10903 } 10904 10905 if (isFastMultiplier) { 10906 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 10907 SDValue Cond = N->getOperand(3); 10908 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10909 DAG.getConstant(CC, MVT::i8), Cond); 10910 // Zero extend the condition if needed. 10911 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 10912 Cond); 10913 // Scale the condition by the difference. 10914 if (Diff != 1) 10915 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 10916 DAG.getConstant(Diff, Cond.getValueType())); 10917 10918 // Add the base if non-zero. 10919 if (FalseC->getAPIntValue() != 0) 10920 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10921 SDValue(FalseC, 0)); 10922 if (N->getNumValues() == 2) // Dead flag value? 10923 return DCI.CombineTo(N, Cond, SDValue()); 10924 return Cond; 10925 } 10926 } 10927 } 10928 } 10929 return SDValue(); 10930} 10931 10932 10933/// PerformMulCombine - Optimize a single multiply with constant into two 10934/// in order to implement it with two cheaper instructions, e.g. 10935/// LEA + SHL, LEA + LEA. 
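/// Illustrative examples (assuming i64, which is all this combine handles):
///   x * 45  ->  (x * 9) * 5                      (LEA + LEA)
///   x * 24  ->  (x << 3) * 3 or (x * 3) << 3     (SHL + LEA; order depends on the use)
/// Amounts that are already a power of two or 3/5/9 are left alone, since a
/// single SHL or LEA suffices.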
10936static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 10937 TargetLowering::DAGCombinerInfo &DCI) { 10938 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 10939 return SDValue(); 10940 10941 EVT VT = N->getValueType(0); 10942 if (VT != MVT::i64) 10943 return SDValue(); 10944 10945 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 10946 if (!C) 10947 return SDValue(); 10948 uint64_t MulAmt = C->getZExtValue(); 10949 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 10950 return SDValue(); 10951 10952 uint64_t MulAmt1 = 0; 10953 uint64_t MulAmt2 = 0; 10954 if ((MulAmt % 9) == 0) { 10955 MulAmt1 = 9; 10956 MulAmt2 = MulAmt / 9; 10957 } else if ((MulAmt % 5) == 0) { 10958 MulAmt1 = 5; 10959 MulAmt2 = MulAmt / 5; 10960 } else if ((MulAmt % 3) == 0) { 10961 MulAmt1 = 3; 10962 MulAmt2 = MulAmt / 3; 10963 } 10964 if (MulAmt2 && 10965 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 10966 DebugLoc DL = N->getDebugLoc(); 10967 10968 if (isPowerOf2_64(MulAmt2) && 10969 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 10970 // If second multiplifer is pow2, issue it first. We want the multiply by 10971 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 10972 // is an add. 10973 std::swap(MulAmt1, MulAmt2); 10974 10975 SDValue NewMul; 10976 if (isPowerOf2_64(MulAmt1)) 10977 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 10978 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 10979 else 10980 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 10981 DAG.getConstant(MulAmt1, VT)); 10982 10983 if (isPowerOf2_64(MulAmt2)) 10984 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 10985 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 10986 else 10987 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 10988 DAG.getConstant(MulAmt2, VT)); 10989 10990 // Do not add new nodes to DAG combiner worklist. 10991 DCI.CombineTo(N, NewMul, false); 10992 } 10993 return SDValue(); 10994} 10995 10996static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 10997 SDValue N0 = N->getOperand(0); 10998 SDValue N1 = N->getOperand(1); 10999 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 11000 EVT VT = N0.getValueType(); 11001 11002 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 11003 // since the result of setcc_c is all zero's or all ones. 11004 if (N1C && N0.getOpcode() == ISD::AND && 11005 N0.getOperand(1).getOpcode() == ISD::Constant) { 11006 SDValue N00 = N0.getOperand(0); 11007 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 11008 ((N00.getOpcode() == ISD::ANY_EXTEND || 11009 N00.getOpcode() == ISD::ZERO_EXTEND) && 11010 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 11011 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 11012 APInt ShAmt = N1C->getAPIntValue(); 11013 Mask = Mask.shl(ShAmt); 11014 if (Mask != 0) 11015 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 11016 N00, DAG.getConstant(Mask, VT)); 11017 } 11018 } 11019 11020 return SDValue(); 11021} 11022 11023/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 11024/// when possible. 
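/// For illustration: a shift whose amount operand is a splat, e.g.
///   (shl <4 x i32> %x, <5, 5, 5, 5>)
/// can be emitted as a single PSLLD with a scalar count instead of being
/// scalarized, which is what the intrinsic-building code below arranges.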
11025static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 11026 const X86Subtarget *Subtarget) { 11027 EVT VT = N->getValueType(0); 11028 if (!VT.isVector() && VT.isInteger() && 11029 N->getOpcode() == ISD::SHL) 11030 return PerformSHLCombine(N, DAG); 11031 11032 // On X86 with SSE2 support, we can transform this to a vector shift if 11033 // all elements are shifted by the same amount. We can't do this in legalize 11034 // because the a constant vector is typically transformed to a constant pool 11035 // so we have no knowledge of the shift amount. 11036 if (!Subtarget->hasSSE2()) 11037 return SDValue(); 11038 11039 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 11040 return SDValue(); 11041 11042 SDValue ShAmtOp = N->getOperand(1); 11043 EVT EltVT = VT.getVectorElementType(); 11044 DebugLoc DL = N->getDebugLoc(); 11045 SDValue BaseShAmt = SDValue(); 11046 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 11047 unsigned NumElts = VT.getVectorNumElements(); 11048 unsigned i = 0; 11049 for (; i != NumElts; ++i) { 11050 SDValue Arg = ShAmtOp.getOperand(i); 11051 if (Arg.getOpcode() == ISD::UNDEF) continue; 11052 BaseShAmt = Arg; 11053 break; 11054 } 11055 for (; i != NumElts; ++i) { 11056 SDValue Arg = ShAmtOp.getOperand(i); 11057 if (Arg.getOpcode() == ISD::UNDEF) continue; 11058 if (Arg != BaseShAmt) { 11059 return SDValue(); 11060 } 11061 } 11062 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 11063 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 11064 SDValue InVec = ShAmtOp.getOperand(0); 11065 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 11066 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 11067 unsigned i = 0; 11068 for (; i != NumElts; ++i) { 11069 SDValue Arg = InVec.getOperand(i); 11070 if (Arg.getOpcode() == ISD::UNDEF) continue; 11071 BaseShAmt = Arg; 11072 break; 11073 } 11074 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 11075 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 11076 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 11077 if (C->getZExtValue() == SplatIdx) 11078 BaseShAmt = InVec.getOperand(1); 11079 } 11080 } 11081 if (BaseShAmt.getNode() == 0) 11082 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 11083 DAG.getIntPtrConstant(0)); 11084 } else 11085 return SDValue(); 11086 11087 // The shift amount is an i32. 11088 if (EltVT.bitsGT(MVT::i32)) 11089 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 11090 else if (EltVT.bitsLT(MVT::i32)) 11091 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 11092 11093 // The shift amount is identical so we can do a vector shift. 
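  // Roughly, the cases below map to the SSE2 shift-by-scalar instructions:
  //   SHL: psllq/pslld/psllw,  SRL: psrlq/psrld/psrlw,  SRA: psrad/psraw.
  // Note there is no psraq, which is why SRA has no v2i64 case.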
11094 SDValue ValOp = N->getOperand(0); 11095 switch (N->getOpcode()) { 11096 default: 11097 llvm_unreachable("Unknown shift opcode!"); 11098 break; 11099 case ISD::SHL: 11100 if (VT == MVT::v2i64) 11101 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11102 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 11103 ValOp, BaseShAmt); 11104 if (VT == MVT::v4i32) 11105 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11106 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 11107 ValOp, BaseShAmt); 11108 if (VT == MVT::v8i16) 11109 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11110 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 11111 ValOp, BaseShAmt); 11112 break; 11113 case ISD::SRA: 11114 if (VT == MVT::v4i32) 11115 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11116 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 11117 ValOp, BaseShAmt); 11118 if (VT == MVT::v8i16) 11119 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11120 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 11121 ValOp, BaseShAmt); 11122 break; 11123 case ISD::SRL: 11124 if (VT == MVT::v2i64) 11125 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11126 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 11127 ValOp, BaseShAmt); 11128 if (VT == MVT::v4i32) 11129 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11130 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 11131 ValOp, BaseShAmt); 11132 if (VT == MVT::v8i16) 11133 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11134 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 11135 ValOp, BaseShAmt); 11136 break; 11137 } 11138 return SDValue(); 11139} 11140 11141 11142static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 11143 TargetLowering::DAGCombinerInfo &DCI, 11144 const X86Subtarget *Subtarget) { 11145 if (DCI.isBeforeLegalizeOps()) 11146 return SDValue(); 11147 11148 // Want to form PANDN nodes, in the hopes of then easily combining them with 11149 // OR and AND nodes to form PBLEND/PSIGN. 
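  // Sketch of the match: (and (xor X, all-ones), Y) -> (PANDN X, Y), relying
  // on pandn computing (~src1) & src2. Either operand of the AND may carry the
  // vnot, hence the two symmetric checks below.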
11150 EVT VT = N->getValueType(0); 11151 if (VT != MVT::v2i64) 11152 return SDValue(); 11153 11154 SDValue N0 = N->getOperand(0); 11155 SDValue N1 = N->getOperand(1); 11156 DebugLoc DL = N->getDebugLoc(); 11157 11158 // Check LHS for vnot 11159 if (N0.getOpcode() == ISD::XOR && 11160 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 11161 return DAG.getNode(X86ISD::PANDN, DL, VT, N0.getOperand(0), N1); 11162 11163 // Check RHS for vnot 11164 if (N1.getOpcode() == ISD::XOR && 11165 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 11166 return DAG.getNode(X86ISD::PANDN, DL, VT, N1.getOperand(0), N0); 11167 11168 return SDValue(); 11169} 11170 11171static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 11172 TargetLowering::DAGCombinerInfo &DCI, 11173 const X86Subtarget *Subtarget) { 11174 if (DCI.isBeforeLegalizeOps()) 11175 return SDValue(); 11176 11177 EVT VT = N->getValueType(0); 11178 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) 11179 return SDValue(); 11180 11181 SDValue N0 = N->getOperand(0); 11182 SDValue N1 = N->getOperand(1); 11183 11184 // look for psign/blend 11185 if (Subtarget->hasSSSE3()) { 11186 if (VT == MVT::v2i64) { 11187 // Canonicalize pandn to RHS 11188 if (N0.getOpcode() == X86ISD::PANDN) 11189 std::swap(N0, N1); 11190 // or (and (m, x), (pandn m, y)) 11191 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::PANDN) { 11192 SDValue Mask = N1.getOperand(0); 11193 SDValue X = N1.getOperand(1); 11194 SDValue Y; 11195 if (N0.getOperand(0) == Mask) 11196 Y = N0.getOperand(1); 11197 if (N0.getOperand(1) == Mask) 11198 Y = N0.getOperand(0); 11199 11200 // Check to see if the mask appeared in both the AND and PANDN and 11201 if (!Y.getNode()) 11202 return SDValue(); 11203 11204 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 11205 if (Mask.getOpcode() != ISD::BITCAST || 11206 X.getOpcode() != ISD::BITCAST || 11207 Y.getOpcode() != ISD::BITCAST) 11208 return SDValue(); 11209 11210 // Look through mask bitcast. 11211 Mask = Mask.getOperand(0); 11212 EVT MaskVT = Mask.getValueType(); 11213 11214 // Validate that the Mask operand is a vector sra node. The sra node 11215 // will be an intrinsic. 11216 if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) 11217 return SDValue(); 11218 11219 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 11220 // there is no psrai.b 11221 switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { 11222 case Intrinsic::x86_sse2_psrai_w: 11223 case Intrinsic::x86_sse2_psrai_d: 11224 break; 11225 default: return SDValue(); 11226 } 11227 11228 // Check that the SRA is all signbits. 11229 SDValue SraC = Mask.getOperand(2); 11230 unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 11231 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 11232 if ((SraAmt + 1) != EltBits) 11233 return SDValue(); 11234 11235 DebugLoc DL = N->getDebugLoc(); 11236 11237 // Now we know we at least have a plendvb with the mask val. See if 11238 // we can form a psignb/w/d. 
11239 // psign = x.type == y.type == mask.type && y = sub(0, x); 11240 X = X.getOperand(0); 11241 Y = Y.getOperand(0); 11242 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 11243 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 11244 X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ 11245 unsigned Opc = 0; 11246 switch (EltBits) { 11247 case 8: Opc = X86ISD::PSIGNB; break; 11248 case 16: Opc = X86ISD::PSIGNW; break; 11249 case 32: Opc = X86ISD::PSIGND; break; 11250 default: break; 11251 } 11252 if (Opc) { 11253 SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); 11254 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); 11255 } 11256 } 11257 // PBLENDVB only available on SSE 4.1 11258 if (!Subtarget->hasSSE41()) 11259 return SDValue(); 11260 11261 X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); 11262 Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); 11263 Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); 11264 Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask); 11265 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); 11266 } 11267 } 11268 } 11269 11270 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 11271 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 11272 std::swap(N0, N1); 11273 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 11274 return SDValue(); 11275 if (!N0.hasOneUse() || !N1.hasOneUse()) 11276 return SDValue(); 11277 11278 SDValue ShAmt0 = N0.getOperand(1); 11279 if (ShAmt0.getValueType() != MVT::i8) 11280 return SDValue(); 11281 SDValue ShAmt1 = N1.getOperand(1); 11282 if (ShAmt1.getValueType() != MVT::i8) 11283 return SDValue(); 11284 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 11285 ShAmt0 = ShAmt0.getOperand(0); 11286 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 11287 ShAmt1 = ShAmt1.getOperand(0); 11288 11289 DebugLoc DL = N->getDebugLoc(); 11290 unsigned Opc = X86ISD::SHLD; 11291 SDValue Op0 = N0.getOperand(0); 11292 SDValue Op1 = N1.getOperand(0); 11293 if (ShAmt0.getOpcode() == ISD::SUB) { 11294 Opc = X86ISD::SHRD; 11295 std::swap(Op0, Op1); 11296 std::swap(ShAmt0, ShAmt1); 11297 } 11298 11299 unsigned Bits = VT.getSizeInBits(); 11300 if (ShAmt1.getOpcode() == ISD::SUB) { 11301 SDValue Sum = ShAmt1.getOperand(0); 11302 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 11303 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 11304 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 11305 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 11306 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 11307 return DAG.getNode(Opc, DL, VT, 11308 Op0, Op1, 11309 DAG.getNode(ISD::TRUNCATE, DL, 11310 MVT::i8, ShAmt0)); 11311 } 11312 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 11313 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 11314 if (ShAmt0C && 11315 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 11316 return DAG.getNode(Opc, DL, VT, 11317 N0.getOperand(0), N1.getOperand(0), 11318 DAG.getNode(ISD::TRUNCATE, DL, 11319 MVT::i8, ShAmt0)); 11320 } 11321 11322 return SDValue(); 11323} 11324 11325/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 11326static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 11327 const X86Subtarget *Subtarget) { 11328 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 11329 // the FP state in cases where an emms may be missing. 
11330 // A preferable solution to the general problem is to figure out the right 11331 // places to insert EMMS. This qualifies as a quick hack. 11332 11333 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 11334 StoreSDNode *St = cast<StoreSDNode>(N); 11335 EVT VT = St->getValue().getValueType(); 11336 if (VT.getSizeInBits() != 64) 11337 return SDValue(); 11338 11339 const Function *F = DAG.getMachineFunction().getFunction(); 11340 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 11341 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 11342 && Subtarget->hasSSE2(); 11343 if ((VT.isVector() || 11344 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 11345 isa<LoadSDNode>(St->getValue()) && 11346 !cast<LoadSDNode>(St->getValue())->isVolatile() && 11347 St->getChain().hasOneUse() && !St->isVolatile()) { 11348 SDNode* LdVal = St->getValue().getNode(); 11349 LoadSDNode *Ld = 0; 11350 int TokenFactorIndex = -1; 11351 SmallVector<SDValue, 8> Ops; 11352 SDNode* ChainVal = St->getChain().getNode(); 11353 // Must be a store of a load. We currently handle two cases: the load 11354 // is a direct child, and it's under an intervening TokenFactor. It is 11355 // possible to dig deeper under nested TokenFactors. 11356 if (ChainVal == LdVal) 11357 Ld = cast<LoadSDNode>(St->getChain()); 11358 else if (St->getValue().hasOneUse() && 11359 ChainVal->getOpcode() == ISD::TokenFactor) { 11360 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 11361 if (ChainVal->getOperand(i).getNode() == LdVal) { 11362 TokenFactorIndex = i; 11363 Ld = cast<LoadSDNode>(St->getValue()); 11364 } else 11365 Ops.push_back(ChainVal->getOperand(i)); 11366 } 11367 } 11368 11369 if (!Ld || !ISD::isNormalLoad(Ld)) 11370 return SDValue(); 11371 11372 // If this is not the MMX case, i.e. we are just turning i64 load/store 11373 // into f64 load/store, avoid the transformation if there are multiple 11374 // uses of the loaded value. 11375 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 11376 return SDValue(); 11377 11378 DebugLoc LdDL = Ld->getDebugLoc(); 11379 DebugLoc StDL = N->getDebugLoc(); 11380 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 11381 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 11382 // pair instead. 11383 if (Subtarget->is64Bit() || F64IsLegal) { 11384 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 11385 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 11386 Ld->getPointerInfo(), Ld->isVolatile(), 11387 Ld->isNonTemporal(), Ld->getAlignment()); 11388 SDValue NewChain = NewLd.getValue(1); 11389 if (TokenFactorIndex != -1) { 11390 Ops.push_back(NewChain); 11391 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11392 Ops.size()); 11393 } 11394 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 11395 St->getPointerInfo(), 11396 St->isVolatile(), St->isNonTemporal(), 11397 St->getAlignment()); 11398 } 11399 11400 // Otherwise, lower to two pairs of 32-bit loads / stores. 
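    // Illustrative shape of the result for an i64 copy on a 32-bit target:
    //   lo = load i32 [addr],  hi = load i32 [addr+4]
    //   store i32 lo, [dst],   store i32 hi, [dst+4]
    // joined by a TokenFactor so the chain dependencies are preserved.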
11401 SDValue LoAddr = Ld->getBasePtr(); 11402 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 11403 DAG.getConstant(4, MVT::i32)); 11404 11405 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 11406 Ld->getPointerInfo(), 11407 Ld->isVolatile(), Ld->isNonTemporal(), 11408 Ld->getAlignment()); 11409 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 11410 Ld->getPointerInfo().getWithOffset(4), 11411 Ld->isVolatile(), Ld->isNonTemporal(), 11412 MinAlign(Ld->getAlignment(), 4)); 11413 11414 SDValue NewChain = LoLd.getValue(1); 11415 if (TokenFactorIndex != -1) { 11416 Ops.push_back(LoLd); 11417 Ops.push_back(HiLd); 11418 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11419 Ops.size()); 11420 } 11421 11422 LoAddr = St->getBasePtr(); 11423 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 11424 DAG.getConstant(4, MVT::i32)); 11425 11426 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 11427 St->getPointerInfo(), 11428 St->isVolatile(), St->isNonTemporal(), 11429 St->getAlignment()); 11430 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 11431 St->getPointerInfo().getWithOffset(4), 11432 St->isVolatile(), 11433 St->isNonTemporal(), 11434 MinAlign(St->getAlignment(), 4)); 11435 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 11436 } 11437 return SDValue(); 11438} 11439 11440/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 11441/// X86ISD::FXOR nodes. 11442static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 11443 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 11444 // F[X]OR(0.0, x) -> x 11445 // F[X]OR(x, 0.0) -> x 11446 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11447 if (C->getValueAPF().isPosZero()) 11448 return N->getOperand(1); 11449 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11450 if (C->getValueAPF().isPosZero()) 11451 return N->getOperand(0); 11452 return SDValue(); 11453} 11454 11455/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 11456static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 11457 // FAND(0.0, x) -> 0.0 11458 // FAND(x, 0.0) -> 0.0 11459 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11460 if (C->getValueAPF().isPosZero()) 11461 return N->getOperand(0); 11462 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11463 if (C->getValueAPF().isPosZero()) 11464 return N->getOperand(1); 11465 return SDValue(); 11466} 11467 11468static SDValue PerformBTCombine(SDNode *N, 11469 SelectionDAG &DAG, 11470 TargetLowering::DAGCombinerInfo &DCI) { 11471 // BT ignores high bits in the bit index operand. 
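  // E.g. for a 32-bit BT only the low 5 bits of the index are observed, so the
  // demanded-bits machinery below may shrink or simplify the index operand.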
11472 SDValue Op1 = N->getOperand(1); 11473 if (Op1.hasOneUse()) { 11474 unsigned BitWidth = Op1.getValueSizeInBits(); 11475 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 11476 APInt KnownZero, KnownOne; 11477 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 11478 !DCI.isBeforeLegalizeOps()); 11479 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11480 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 11481 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 11482 DCI.CommitTargetLoweringOpt(TLO); 11483 } 11484 return SDValue(); 11485} 11486 11487static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 11488 SDValue Op = N->getOperand(0); 11489 if (Op.getOpcode() == ISD::BITCAST) 11490 Op = Op.getOperand(0); 11491 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 11492 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 11493 VT.getVectorElementType().getSizeInBits() == 11494 OpVT.getVectorElementType().getSizeInBits()) { 11495 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 11496 } 11497 return SDValue(); 11498} 11499 11500static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 11501 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 11502 // (and (i32 x86isd::setcc_carry), 1) 11503 // This eliminates the zext. This transformation is necessary because 11504 // ISD::SETCC is always legalized to i8. 11505 DebugLoc dl = N->getDebugLoc(); 11506 SDValue N0 = N->getOperand(0); 11507 EVT VT = N->getValueType(0); 11508 if (N0.getOpcode() == ISD::AND && 11509 N0.hasOneUse() && 11510 N0.getOperand(0).hasOneUse()) { 11511 SDValue N00 = N0.getOperand(0); 11512 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 11513 return SDValue(); 11514 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 11515 if (!C || C->getZExtValue() != 1) 11516 return SDValue(); 11517 return DAG.getNode(ISD::AND, dl, VT, 11518 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 11519 N00.getOperand(0), N00.getOperand(1)), 11520 DAG.getConstant(1, VT)); 11521 } 11522 11523 return SDValue(); 11524} 11525 11526// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 11527static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 11528 unsigned X86CC = N->getConstantOperandVal(0); 11529 SDValue EFLAG = N->getOperand(1); 11530 DebugLoc DL = N->getDebugLoc(); 11531 11532 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 11533 // a zext and produces an all-ones bit which is more useful than 0/1 in some 11534 // cases. 11535 if (X86CC == X86::COND_B) 11536 return DAG.getNode(ISD::AND, DL, MVT::i8, 11537 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 11538 DAG.getConstant(X86CC, MVT::i8), EFLAG), 11539 DAG.getConstant(1, MVT::i8)); 11540 11541 return SDValue(); 11542} 11543 11544// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 11545static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 11546 X86TargetLowering::DAGCombinerInfo &DCI) { 11547 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 11548 // the result is either zero or one (depending on the input carry bit). 11549 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 11550 if (X86::isZeroNode(N->getOperand(0)) && 11551 X86::isZeroNode(N->getOperand(1)) && 11552 // We don't have a good way to replace an EFLAGS use, so only do this when 11553 // dead right now. 
11554 SDValue(N, 1).use_empty()) { 11555 DebugLoc DL = N->getDebugLoc(); 11556 EVT VT = N->getValueType(0); 11557 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 11558 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 11559 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 11560 DAG.getConstant(X86::COND_B,MVT::i8), 11561 N->getOperand(2)), 11562 DAG.getConstant(1, VT)); 11563 return DCI.CombineTo(N, Res1, CarryOut); 11564 } 11565 11566 return SDValue(); 11567} 11568 11569// fold (add Y, (sete X, 0)) -> adc 0, Y 11570// (add Y, (setne X, 0)) -> sbb -1, Y 11571// (sub (sete X, 0), Y) -> sbb 0, Y 11572// (sub (setne X, 0), Y) -> adc -1, Y 11573static SDValue OptimizeConditonalInDecrement(SDNode *N, SelectionDAG &DAG) { 11574 DebugLoc DL = N->getDebugLoc(); 11575 11576 // Look through ZExts. 11577 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 11578 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 11579 return SDValue(); 11580 11581 SDValue SetCC = Ext.getOperand(0); 11582 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 11583 return SDValue(); 11584 11585 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 11586 if (CC != X86::COND_E && CC != X86::COND_NE) 11587 return SDValue(); 11588 11589 SDValue Cmp = SetCC.getOperand(1); 11590 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 11591 !X86::isZeroNode(Cmp.getOperand(1)) || 11592 !Cmp.getOperand(0).getValueType().isInteger()) 11593 return SDValue(); 11594 11595 SDValue CmpOp0 = Cmp.getOperand(0); 11596 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 11597 DAG.getConstant(1, CmpOp0.getValueType())); 11598 11599 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 11600 if (CC == X86::COND_NE) 11601 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 11602 DL, OtherVal.getValueType(), OtherVal, 11603 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 11604 return DAG.getNode(N->getOpcode() == ISD::SUB ? 
X86ISD::SBB : X86ISD::ADC, 11605 DL, OtherVal.getValueType(), OtherVal, 11606 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 11607} 11608 11609SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 11610 DAGCombinerInfo &DCI) const { 11611 SelectionDAG &DAG = DCI.DAG; 11612 switch (N->getOpcode()) { 11613 default: break; 11614 case ISD::EXTRACT_VECTOR_ELT: 11615 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 11616 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 11617 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 11618 case ISD::ADD: 11619 case ISD::SUB: return OptimizeConditonalInDecrement(N, DAG); 11620 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); 11621 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 11622 case ISD::SHL: 11623 case ISD::SRA: 11624 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 11625 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); 11626 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 11627 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 11628 case X86ISD::FXOR: 11629 case X86ISD::FOR: return PerformFORCombine(N, DAG); 11630 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 11631 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 11632 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 11633 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 11634 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); 11635 case X86ISD::SHUFPS: // Handle all target specific shuffles 11636 case X86ISD::SHUFPD: 11637 case X86ISD::PALIGN: 11638 case X86ISD::PUNPCKHBW: 11639 case X86ISD::PUNPCKHWD: 11640 case X86ISD::PUNPCKHDQ: 11641 case X86ISD::PUNPCKHQDQ: 11642 case X86ISD::UNPCKHPS: 11643 case X86ISD::UNPCKHPD: 11644 case X86ISD::PUNPCKLBW: 11645 case X86ISD::PUNPCKLWD: 11646 case X86ISD::PUNPCKLDQ: 11647 case X86ISD::PUNPCKLQDQ: 11648 case X86ISD::UNPCKLPS: 11649 case X86ISD::UNPCKLPD: 11650 case X86ISD::MOVHLPS: 11651 case X86ISD::MOVLHPS: 11652 case X86ISD::PSHUFD: 11653 case X86ISD::PSHUFHW: 11654 case X86ISD::PSHUFLW: 11655 case X86ISD::MOVSS: 11656 case X86ISD::MOVSD: 11657 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI); 11658 } 11659 11660 return SDValue(); 11661} 11662 11663/// isTypeDesirableForOp - Return true if the target has native support for 11664/// the specified value type and it is 'desirable' to use the type for the 11665/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 11666/// instruction encodings are longer and some i16 instructions are slow. 11667bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 11668 if (!isTypeLegal(VT)) 11669 return false; 11670 if (VT != MVT::i16) 11671 return true; 11672 11673 switch (Opc) { 11674 default: 11675 return true; 11676 case ISD::LOAD: 11677 case ISD::SIGN_EXTEND: 11678 case ISD::ZERO_EXTEND: 11679 case ISD::ANY_EXTEND: 11680 case ISD::SHL: 11681 case ISD::SRL: 11682 case ISD::SUB: 11683 case ISD::ADD: 11684 case ISD::MUL: 11685 case ISD::AND: 11686 case ISD::OR: 11687 case ISD::XOR: 11688 return false; 11689 } 11690} 11691 11692/// IsDesirableToPromoteOp - This method query the target whether it is 11693/// beneficial for dag combiner to promote the specified node. If true, it 11694/// should return the desired promotion type by reference. 
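/// For instance, promoting an i16 add to i32 trades "addw" (which needs a
/// 0x66 operand-size prefix and can be slower) for a plain "addl"; the checks
/// below mostly avoid promotions that would defeat load/store folding.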
11695bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 11696 EVT VT = Op.getValueType(); 11697 if (VT != MVT::i16) 11698 return false; 11699 11700 bool Promote = false; 11701 bool Commute = false; 11702 switch (Op.getOpcode()) { 11703 default: break; 11704 case ISD::LOAD: { 11705 LoadSDNode *LD = cast<LoadSDNode>(Op); 11706 // If the non-extending load has a single use and it's not live out, then it 11707 // might be folded. 11708 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 11709 Op.hasOneUse()*/) { 11710 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 11711 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 11712 // The only case where we'd want to promote LOAD (rather then it being 11713 // promoted as an operand is when it's only use is liveout. 11714 if (UI->getOpcode() != ISD::CopyToReg) 11715 return false; 11716 } 11717 } 11718 Promote = true; 11719 break; 11720 } 11721 case ISD::SIGN_EXTEND: 11722 case ISD::ZERO_EXTEND: 11723 case ISD::ANY_EXTEND: 11724 Promote = true; 11725 break; 11726 case ISD::SHL: 11727 case ISD::SRL: { 11728 SDValue N0 = Op.getOperand(0); 11729 // Look out for (store (shl (load), x)). 11730 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 11731 return false; 11732 Promote = true; 11733 break; 11734 } 11735 case ISD::ADD: 11736 case ISD::MUL: 11737 case ISD::AND: 11738 case ISD::OR: 11739 case ISD::XOR: 11740 Commute = true; 11741 // fallthrough 11742 case ISD::SUB: { 11743 SDValue N0 = Op.getOperand(0); 11744 SDValue N1 = Op.getOperand(1); 11745 if (!Commute && MayFoldLoad(N1)) 11746 return false; 11747 // Avoid disabling potential load folding opportunities. 11748 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 11749 return false; 11750 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 11751 return false; 11752 Promote = true; 11753 } 11754 } 11755 11756 PVT = MVT::i32; 11757 return Promote; 11758} 11759 11760//===----------------------------------------------------------------------===// 11761// X86 Inline Assembly Support 11762//===----------------------------------------------------------------------===// 11763 11764bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 11765 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 11766 11767 std::string AsmStr = IA->getAsmString(); 11768 11769 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 11770 SmallVector<StringRef, 4> AsmPieces; 11771 SplitString(AsmStr, AsmPieces, ";\n"); 11772 11773 switch (AsmPieces.size()) { 11774 default: return false; 11775 case 1: 11776 AsmStr = AsmPieces[0]; 11777 AsmPieces.clear(); 11778 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 11779 11780 // FIXME: this should verify that we are targetting a 486 or better. If not, 11781 // we will turn this bswap into something that will be lowered to logical ops 11782 // instead of emitting the bswap asm. For now, we don't support 486 or lower 11783 // so don't worry about this. 11784 // bswap $0 11785 if (AsmPieces.size() == 2 && 11786 (AsmPieces[0] == "bswap" || 11787 AsmPieces[0] == "bswapq" || 11788 AsmPieces[0] == "bswapl") && 11789 (AsmPieces[1] == "$0" || 11790 AsmPieces[1] == "${0:q}")) { 11791 // No need to check constraints, nothing other than the equivalent of 11792 // "=r,0" would be valid here. 
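      // Illustration: something like
      //   asm("bswap $0" : "=r"(v) : "0"(v))
      // on a suitably wide integer is rewritten into a call to llvm.bswap,
      // provided the width check below is satisfied.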
11793 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 11794 if (!Ty || Ty->getBitWidth() % 16 != 0) 11795 return false; 11796 return IntrinsicLowering::LowerToByteSwap(CI); 11797 } 11798 // rorw $$8, ${0:w} --> llvm.bswap.i16 11799 if (CI->getType()->isIntegerTy(16) && 11800 AsmPieces.size() == 3 && 11801 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 11802 AsmPieces[1] == "$$8," && 11803 AsmPieces[2] == "${0:w}" && 11804 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 11805 AsmPieces.clear(); 11806 const std::string &ConstraintsStr = IA->getConstraintString(); 11807 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 11808 std::sort(AsmPieces.begin(), AsmPieces.end()); 11809 if (AsmPieces.size() == 4 && 11810 AsmPieces[0] == "~{cc}" && 11811 AsmPieces[1] == "~{dirflag}" && 11812 AsmPieces[2] == "~{flags}" && 11813 AsmPieces[3] == "~{fpsr}") { 11814 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 11815 if (!Ty || Ty->getBitWidth() % 16 != 0) 11816 return false; 11817 return IntrinsicLowering::LowerToByteSwap(CI); 11818 } 11819 } 11820 break; 11821 case 3: 11822 if (CI->getType()->isIntegerTy(32) && 11823 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 11824 SmallVector<StringRef, 4> Words; 11825 SplitString(AsmPieces[0], Words, " \t,"); 11826 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 11827 Words[2] == "${0:w}") { 11828 Words.clear(); 11829 SplitString(AsmPieces[1], Words, " \t,"); 11830 if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && 11831 Words[2] == "$0") { 11832 Words.clear(); 11833 SplitString(AsmPieces[2], Words, " \t,"); 11834 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 11835 Words[2] == "${0:w}") { 11836 AsmPieces.clear(); 11837 const std::string &ConstraintsStr = IA->getConstraintString(); 11838 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 11839 std::sort(AsmPieces.begin(), AsmPieces.end()); 11840 if (AsmPieces.size() == 4 && 11841 AsmPieces[0] == "~{cc}" && 11842 AsmPieces[1] == "~{dirflag}" && 11843 AsmPieces[2] == "~{flags}" && 11844 AsmPieces[3] == "~{fpsr}") { 11845 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 11846 if (!Ty || Ty->getBitWidth() % 16 != 0) 11847 return false; 11848 return IntrinsicLowering::LowerToByteSwap(CI); 11849 } 11850 } 11851 } 11852 } 11853 } 11854 11855 if (CI->getType()->isIntegerTy(64)) { 11856 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 11857 if (Constraints.size() >= 2 && 11858 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 11859 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 11860 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 11861 SmallVector<StringRef, 4> Words; 11862 SplitString(AsmPieces[0], Words, " \t"); 11863 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 11864 Words.clear(); 11865 SplitString(AsmPieces[1], Words, " \t"); 11866 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 11867 Words.clear(); 11868 SplitString(AsmPieces[2], Words, " \t,"); 11869 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 11870 Words[2] == "%edx") { 11871 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 11872 if (!Ty || Ty->getBitWidth() % 16 != 0) 11873 return false; 11874 return IntrinsicLowering::LowerToByteSwap(CI); 11875 } 11876 } 11877 } 11878 } 11879 } 11880 break; 11881 } 11882 return false; 11883} 11884 11885 11886 
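// A rough guide to the single-letter groups classified below (see the switch
// for the full set): letters like 'q', 'x' or 'Y' name whole register classes,
// 'a'..'d', 'S', 'D' and 'A' pin a specific register, and 'I'..'N', 'e', 'Z'
// describe immediate ranges that are validated in LowerAsmOperandForConstraint.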
11887/// getConstraintType - Given a constraint letter, return the type of 11888/// constraint it is for this target. 11889X86TargetLowering::ConstraintType 11890X86TargetLowering::getConstraintType(const std::string &Constraint) const { 11891 if (Constraint.size() == 1) { 11892 switch (Constraint[0]) { 11893 case 'R': 11894 case 'q': 11895 case 'Q': 11896 case 'f': 11897 case 't': 11898 case 'u': 11899 case 'y': 11900 case 'x': 11901 case 'Y': 11902 return C_RegisterClass; 11903 case 'a': 11904 case 'b': 11905 case 'c': 11906 case 'd': 11907 case 'S': 11908 case 'D': 11909 case 'A': 11910 return C_Register; 11911 case 'I': 11912 case 'J': 11913 case 'K': 11914 case 'L': 11915 case 'M': 11916 case 'N': 11917 case 'G': 11918 case 'C': 11919 case 'e': 11920 case 'Z': 11921 return C_Other; 11922 default: 11923 break; 11924 } 11925 } 11926 return TargetLowering::getConstraintType(Constraint); 11927} 11928 11929/// Examine constraint type and operand type and determine a weight value. 11930/// This object must already have been set up with the operand type 11931/// and the current alternative constraint selected. 11932TargetLowering::ConstraintWeight 11933 X86TargetLowering::getSingleConstraintMatchWeight( 11934 AsmOperandInfo &info, const char *constraint) const { 11935 ConstraintWeight weight = CW_Invalid; 11936 Value *CallOperandVal = info.CallOperandVal; 11937 // If we don't have a value, we can't do a match, 11938 // but allow it at the lowest weight. 11939 if (CallOperandVal == NULL) 11940 return CW_Default; 11941 const Type *type = CallOperandVal->getType(); 11942 // Look at the constraint type. 11943 switch (*constraint) { 11944 default: 11945 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 11946 case 'R': 11947 case 'q': 11948 case 'Q': 11949 case 'a': 11950 case 'b': 11951 case 'c': 11952 case 'd': 11953 case 'S': 11954 case 'D': 11955 case 'A': 11956 if (CallOperandVal->getType()->isIntegerTy()) 11957 weight = CW_SpecificReg; 11958 break; 11959 case 'f': 11960 case 't': 11961 case 'u': 11962 if (type->isFloatingPointTy()) 11963 weight = CW_SpecificReg; 11964 break; 11965 case 'y': 11966 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 11967 weight = CW_SpecificReg; 11968 break; 11969 case 'x': 11970 case 'Y': 11971 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) 11972 weight = CW_Register; 11973 break; 11974 case 'I': 11975 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 11976 if (C->getZExtValue() <= 31) 11977 weight = CW_Constant; 11978 } 11979 break; 11980 case 'J': 11981 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11982 if (C->getZExtValue() <= 63) 11983 weight = CW_Constant; 11984 } 11985 break; 11986 case 'K': 11987 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11988 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 11989 weight = CW_Constant; 11990 } 11991 break; 11992 case 'L': 11993 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11994 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 11995 weight = CW_Constant; 11996 } 11997 break; 11998 case 'M': 11999 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 12000 if (C->getZExtValue() <= 3) 12001 weight = CW_Constant; 12002 } 12003 break; 12004 case 'N': 12005 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 12006 if (C->getZExtValue() <= 0xff) 12007 weight = CW_Constant; 12008 } 12009 break; 12010 case 'G': 12011 case 'C': 12012 if 
(dyn_cast<ConstantFP>(CallOperandVal)) { 12013 weight = CW_Constant; 12014 } 12015 break; 12016 case 'e': 12017 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 12018 if ((C->getSExtValue() >= -0x80000000LL) && 12019 (C->getSExtValue() <= 0x7fffffffLL)) 12020 weight = CW_Constant; 12021 } 12022 break; 12023 case 'Z': 12024 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 12025 if (C->getZExtValue() <= 0xffffffff) 12026 weight = CW_Constant; 12027 } 12028 break; 12029 } 12030 return weight; 12031} 12032 12033/// LowerXConstraint - try to replace an X constraint, which matches anything, 12034/// with another that has more specific requirements based on the type of the 12035/// corresponding operand. 12036const char *X86TargetLowering:: 12037LowerXConstraint(EVT ConstraintVT) const { 12038 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 12039 // 'f' like normal targets. 12040 if (ConstraintVT.isFloatingPoint()) { 12041 if (Subtarget->hasXMMInt()) 12042 return "Y"; 12043 if (Subtarget->hasXMM()) 12044 return "x"; 12045 } 12046 12047 return TargetLowering::LowerXConstraint(ConstraintVT); 12048} 12049 12050/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 12051/// vector. If it is invalid, don't add anything to Ops. 12052void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 12053 char Constraint, 12054 std::vector<SDValue>&Ops, 12055 SelectionDAG &DAG) const { 12056 SDValue Result(0, 0); 12057 12058 switch (Constraint) { 12059 default: break; 12060 case 'I': 12061 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12062 if (C->getZExtValue() <= 31) { 12063 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 12064 break; 12065 } 12066 } 12067 return; 12068 case 'J': 12069 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12070 if (C->getZExtValue() <= 63) { 12071 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 12072 break; 12073 } 12074 } 12075 return; 12076 case 'K': 12077 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12078 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 12079 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 12080 break; 12081 } 12082 } 12083 return; 12084 case 'N': 12085 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12086 if (C->getZExtValue() <= 255) { 12087 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 12088 break; 12089 } 12090 } 12091 return; 12092 case 'e': { 12093 // 32-bit signed value 12094 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12095 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 12096 C->getSExtValue())) { 12097 // Widen to 64 bits here to get it sign extended. 12098 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 12099 break; 12100 } 12101 // FIXME gcc accepts some relocatable values here too, but only in certain 12102 // memory models; it's complicated. 12103 } 12104 return; 12105 } 12106 case 'Z': { 12107 // 32-bit unsigned value 12108 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12109 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 12110 C->getZExtValue())) { 12111 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 12112 break; 12113 } 12114 } 12115 // FIXME gcc accepts some relocatable values here too, but only in certain 12116 // memory models; it's complicated. 
12117 return; 12118 } 12119 case 'i': { 12120 // Literal immediates are always ok. 12121 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 12122 // Widen to 64 bits here to get it sign extended. 12123 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 12124 break; 12125 } 12126 12127 // In any sort of PIC mode addresses need to be computed at runtime by 12128 // adding in a register or some sort of table lookup. These can't 12129 // be used as immediates. 12130 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 12131 return; 12132 12133 // If we are in non-pic codegen mode, we allow the address of a global (with 12134 // an optional displacement) to be used with 'i'. 12135 GlobalAddressSDNode *GA = 0; 12136 int64_t Offset = 0; 12137 12138 // Match either (GA), (GA+C), (GA+C1+C2), etc. 12139 while (1) { 12140 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 12141 Offset += GA->getOffset(); 12142 break; 12143 } else if (Op.getOpcode() == ISD::ADD) { 12144 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 12145 Offset += C->getZExtValue(); 12146 Op = Op.getOperand(0); 12147 continue; 12148 } 12149 } else if (Op.getOpcode() == ISD::SUB) { 12150 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 12151 Offset += -C->getZExtValue(); 12152 Op = Op.getOperand(0); 12153 continue; 12154 } 12155 } 12156 12157 // Otherwise, this isn't something we can handle, reject it. 12158 return; 12159 } 12160 12161 const GlobalValue *GV = GA->getGlobal(); 12162 // If we require an extra load to get this address, as in PIC mode, we 12163 // can't accept it. 12164 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 12165 getTargetMachine()))) 12166 return; 12167 12168 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 12169 GA->getValueType(0), Offset); 12170 break; 12171 } 12172 } 12173 12174 if (Result.getNode()) { 12175 Ops.push_back(Result); 12176 return; 12177 } 12178 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 12179} 12180 12181std::vector<unsigned> X86TargetLowering:: 12182getRegClassForInlineAsmConstraint(const std::string &Constraint, 12183 EVT VT) const { 12184 if (Constraint.size() == 1) { 12185 // FIXME: not handling fp-stack yet! 12186 switch (Constraint[0]) { // GCC X86 Constraint Letters 12187 default: break; // Unknown constraint letter 12188 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
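      // Roughly: in 64-bit mode the REX prefix makes every GPR byte-addressable,
      // so 'q' can hand back the full GPR set; in 32-bit mode it falls through
      // to the a/b/c/d-only Q_REGS lists below.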
12189 if (Subtarget->is64Bit()) { 12190 if (VT == MVT::i32) 12191 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 12192 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 12193 X86::R10D,X86::R11D,X86::R12D, 12194 X86::R13D,X86::R14D,X86::R15D, 12195 X86::EBP, X86::ESP, 0); 12196 else if (VT == MVT::i16) 12197 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 12198 X86::SI, X86::DI, X86::R8W,X86::R9W, 12199 X86::R10W,X86::R11W,X86::R12W, 12200 X86::R13W,X86::R14W,X86::R15W, 12201 X86::BP, X86::SP, 0); 12202 else if (VT == MVT::i8) 12203 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 12204 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 12205 X86::R10B,X86::R11B,X86::R12B, 12206 X86::R13B,X86::R14B,X86::R15B, 12207 X86::BPL, X86::SPL, 0); 12208 12209 else if (VT == MVT::i64) 12210 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 12211 X86::RSI, X86::RDI, X86::R8, X86::R9, 12212 X86::R10, X86::R11, X86::R12, 12213 X86::R13, X86::R14, X86::R15, 12214 X86::RBP, X86::RSP, 0); 12215 12216 break; 12217 } 12218 // 32-bit fallthrough 12219 case 'Q': // Q_REGS 12220 if (VT == MVT::i32) 12221 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 12222 else if (VT == MVT::i16) 12223 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 12224 else if (VT == MVT::i8) 12225 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 12226 else if (VT == MVT::i64) 12227 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 12228 break; 12229 } 12230 } 12231 12232 return std::vector<unsigned>(); 12233} 12234 12235std::pair<unsigned, const TargetRegisterClass*> 12236X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 12237 EVT VT) const { 12238 // First, see if this is a constraint that directly corresponds to an LLVM 12239 // register class. 12240 if (Constraint.size() == 1) { 12241 // GCC Constraint Letters 12242 switch (Constraint[0]) { 12243 default: break; 12244 case 'r': // GENERAL_REGS 12245 case 'l': // INDEX_REGS 12246 if (VT == MVT::i8) 12247 return std::make_pair(0U, X86::GR8RegisterClass); 12248 if (VT == MVT::i16) 12249 return std::make_pair(0U, X86::GR16RegisterClass); 12250 if (VT == MVT::i32 || !Subtarget->is64Bit()) 12251 return std::make_pair(0U, X86::GR32RegisterClass); 12252 return std::make_pair(0U, X86::GR64RegisterClass); 12253 case 'R': // LEGACY_REGS 12254 if (VT == MVT::i8) 12255 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 12256 if (VT == MVT::i16) 12257 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 12258 if (VT == MVT::i32 || !Subtarget->is64Bit()) 12259 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 12260 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 12261 case 'f': // FP Stack registers. 12262 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 12263 // value to the correct fpstack register class. 12264 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 12265 return std::make_pair(0U, X86::RFP32RegisterClass); 12266 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 12267 return std::make_pair(0U, X86::RFP64RegisterClass); 12268 return std::make_pair(0U, X86::RFP80RegisterClass); 12269 case 'y': // MMX_REGS if MMX allowed. 12270 if (!Subtarget->hasMMX()) break; 12271 return std::make_pair(0U, X86::VR64RegisterClass); 12272 case 'Y': // SSE_REGS if SSE2 allowed 12273 if (!Subtarget->hasXMMInt()) break; 12274 // FALL THROUGH. 
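    // ('Y' requires SSE2 but otherwise selects the same XMM register classes
    // as 'x' below.)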
12275 case 'x': // SSE_REGS if SSE1 allowed 12276 if (!Subtarget->hasXMM()) break; 12277 12278 switch (VT.getSimpleVT().SimpleTy) { 12279 default: break; 12280 // Scalar SSE types. 12281 case MVT::f32: 12282 case MVT::i32: 12283 return std::make_pair(0U, X86::FR32RegisterClass); 12284 case MVT::f64: 12285 case MVT::i64: 12286 return std::make_pair(0U, X86::FR64RegisterClass); 12287 // Vector types. 12288 case MVT::v16i8: 12289 case MVT::v8i16: 12290 case MVT::v4i32: 12291 case MVT::v2i64: 12292 case MVT::v4f32: 12293 case MVT::v2f64: 12294 return std::make_pair(0U, X86::VR128RegisterClass); 12295 } 12296 break; 12297 } 12298 } 12299 12300 // Use the default implementation in TargetLowering to convert the register 12301 // constraint into a member of a register class. 12302 std::pair<unsigned, const TargetRegisterClass*> Res; 12303 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 12304 12305 // Not found as a standard register? 12306 if (Res.second == 0) { 12307 // Map st(0) -> st(7) -> ST0 12308 if (Constraint.size() == 7 && Constraint[0] == '{' && 12309 tolower(Constraint[1]) == 's' && 12310 tolower(Constraint[2]) == 't' && 12311 Constraint[3] == '(' && 12312 (Constraint[4] >= '0' && Constraint[4] <= '7') && 12313 Constraint[5] == ')' && 12314 Constraint[6] == '}') { 12315 12316 Res.first = X86::ST0+Constraint[4]-'0'; 12317 Res.second = X86::RFP80RegisterClass; 12318 return Res; 12319 } 12320 12321 // GCC allows "st(0)" to be called just plain "st". 12322 if (StringRef("{st}").equals_lower(Constraint)) { 12323 Res.first = X86::ST0; 12324 Res.second = X86::RFP80RegisterClass; 12325 return Res; 12326 } 12327 12328 // flags -> EFLAGS 12329 if (StringRef("{flags}").equals_lower(Constraint)) { 12330 Res.first = X86::EFLAGS; 12331 Res.second = X86::CCRRegisterClass; 12332 return Res; 12333 } 12334 12335 // 'A' means EAX + EDX. 12336 if (Constraint == "A") { 12337 Res.first = X86::EAX; 12338 Res.second = X86::GR32_ADRegisterClass; 12339 return Res; 12340 } 12341 return Res; 12342 } 12343 12344 // Otherwise, check to see if this is a register class of the wrong value 12345 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 12346 // turn into {ax},{dx}. 12347 if (Res.second->hasType(VT)) 12348 return Res; // Correct type already, nothing to do. 12349 12350 // All of the single-register GCC register classes map their values onto 12351 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 12352 // really want an 8-bit or 32-bit register, map to the appropriate register 12353 // class and return the appropriate register. 
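  // Illustrative only (not part of this revision): Clang, for instance,
  // lowers the GCC constraint "a" to the explicit-register form "{ax}", so
  //   int v;
  //   asm("" : "=a"(v));
  // first resolves to AX in GR16 and is then remapped below to EAX in GR32
  // because the operand type is i32.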
12354 if (Res.second == X86::GR16RegisterClass) { 12355 if (VT == MVT::i8) { 12356 unsigned DestReg = 0; 12357 switch (Res.first) { 12358 default: break; 12359 case X86::AX: DestReg = X86::AL; break; 12360 case X86::DX: DestReg = X86::DL; break; 12361 case X86::CX: DestReg = X86::CL; break; 12362 case X86::BX: DestReg = X86::BL; break; 12363 } 12364 if (DestReg) { 12365 Res.first = DestReg; 12366 Res.second = X86::GR8RegisterClass; 12367 } 12368 } else if (VT == MVT::i32) { 12369 unsigned DestReg = 0; 12370 switch (Res.first) { 12371 default: break; 12372 case X86::AX: DestReg = X86::EAX; break; 12373 case X86::DX: DestReg = X86::EDX; break; 12374 case X86::CX: DestReg = X86::ECX; break; 12375 case X86::BX: DestReg = X86::EBX; break; 12376 case X86::SI: DestReg = X86::ESI; break; 12377 case X86::DI: DestReg = X86::EDI; break; 12378 case X86::BP: DestReg = X86::EBP; break; 12379 case X86::SP: DestReg = X86::ESP; break; 12380 } 12381 if (DestReg) { 12382 Res.first = DestReg; 12383 Res.second = X86::GR32RegisterClass; 12384 } 12385 } else if (VT == MVT::i64) { 12386 unsigned DestReg = 0; 12387 switch (Res.first) { 12388 default: break; 12389 case X86::AX: DestReg = X86::RAX; break; 12390 case X86::DX: DestReg = X86::RDX; break; 12391 case X86::CX: DestReg = X86::RCX; break; 12392 case X86::BX: DestReg = X86::RBX; break; 12393 case X86::SI: DestReg = X86::RSI; break; 12394 case X86::DI: DestReg = X86::RDI; break; 12395 case X86::BP: DestReg = X86::RBP; break; 12396 case X86::SP: DestReg = X86::RSP; break; 12397 } 12398 if (DestReg) { 12399 Res.first = DestReg; 12400 Res.second = X86::GR64RegisterClass; 12401 } 12402 } 12403 } else if (Res.second == X86::FR32RegisterClass || 12404 Res.second == X86::FR64RegisterClass || 12405 Res.second == X86::VR128RegisterClass) { 12406 // Handle references to XMM physical registers that got mapped into the 12407 // wrong class. This can happen with constraints like {xmm0} where the 12408 // target independent register mapper will just pick the first match it can 12409 // find, ignoring the required type. 12410 if (VT == MVT::f32) 12411 Res.second = X86::FR32RegisterClass; 12412 else if (VT == MVT::f64) 12413 Res.second = X86::FR64RegisterClass; 12414 else if (X86::VR128RegisterClass->hasType(VT)) 12415 Res.second = X86::VR128RegisterClass; 12416 } 12417 12418 return Res; 12419} 12420
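// Illustrative only (not part of this revision): an IR-level constraint such
// as
//   %d = call double asm "sqrtsd $0, $0", "={xmm0},0"(double %x)
// may first be matched to XMM0 in VR128 by the target-independent register
// mapper; the fix-up at the end of getRegForInlineAsmConstraint above narrows
// the class to FR64 so the f64 operand is treated as a scalar double.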