X86ISelLowering.cpp revision 2763538609fd455d63c192b320c73fb5d48c3e47
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that X86 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "x86-isel" 16#include "X86.h" 17#include "X86InstrBuilder.h" 18#include "X86ISelLowering.h" 19#include "X86ShuffleDecode.h" 20#include "X86TargetMachine.h" 21#include "X86TargetObjectFile.h" 22#include "llvm/CallingConv.h" 23#include "llvm/Constants.h" 24#include "llvm/DerivedTypes.h" 25#include "llvm/GlobalAlias.h" 26#include "llvm/GlobalVariable.h" 27#include "llvm/Function.h" 28#include "llvm/Instructions.h" 29#include "llvm/Intrinsics.h" 30#include "llvm/LLVMContext.h" 31#include "llvm/CodeGen/IntrinsicLowering.h" 32#include "llvm/CodeGen/MachineFrameInfo.h" 33#include "llvm/CodeGen/MachineFunction.h" 34#include "llvm/CodeGen/MachineInstrBuilder.h" 35#include "llvm/CodeGen/MachineJumpTableInfo.h" 36#include "llvm/CodeGen/MachineModuleInfo.h" 37#include "llvm/CodeGen/MachineRegisterInfo.h" 38#include "llvm/CodeGen/PseudoSourceValue.h" 39#include "llvm/MC/MCAsmInfo.h" 40#include "llvm/MC/MCContext.h" 41#include "llvm/MC/MCExpr.h" 42#include "llvm/MC/MCSymbol.h" 43#include "llvm/ADT/BitVector.h" 44#include "llvm/ADT/SmallSet.h" 45#include "llvm/ADT/Statistic.h" 46#include "llvm/ADT/StringExtras.h" 47#include "llvm/ADT/VectorExtras.h" 48#include "llvm/Support/CommandLine.h" 49#include "llvm/Support/Debug.h" 50#include "llvm/Support/Dwarf.h" 51#include "llvm/Support/ErrorHandling.h" 52#include "llvm/Support/MathExtras.h" 53#include "llvm/Support/raw_ostream.h" 54using namespace llvm; 55using namespace dwarf; 56 57STATISTIC(NumTailCalls, "Number of tail calls"); 58 59// Forward declarations. 60static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 61 SDValue V2); 62 63static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { 64 const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); 65 bool is64Bit = Subtarget->is64Bit(); 66 67 if (Subtarget->isTargetEnvMacho()) { 68 if (is64Bit) 69 return new X8664_MachoTargetObjectFile(); 70 return new TargetLoweringObjectFileMachO(); 71 } 72 73 if (Subtarget->isTargetELF()) { 74 if (is64Bit) 75 return new X8664_ELFTargetObjectFile(TM); 76 return new X8632_ELFTargetObjectFile(TM); 77 } 78 if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) 79 return new TargetLoweringObjectFileCOFF(); 80 llvm_unreachable("unknown subtarget type"); 81} 82 83X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 84 : TargetLowering(TM, createTLOF(TM)) { 85 Subtarget = &TM.getSubtarget<X86Subtarget>(); 86 X86ScalarSSEf64 = Subtarget->hasXMMInt(); 87 X86ScalarSSEf32 = Subtarget->hasXMM(); 88 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; 89 90 RegInfo = TM.getRegisterInfo(); 91 TD = getTargetData(); 92 93 // Set up the TargetLowering object. 94 static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; 95 96 // X86 is weird, it always uses i8 for shift amounts and setcc results. 
97 setShiftAmountType(MVT::i8); 98 setBooleanContents(ZeroOrOneBooleanContent); 99 setSchedulingPreference(Sched::RegPressure); 100 setStackPointerRegisterToSaveRestore(X86StackPtr); 101 102 if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { 103 // Setup Windows compiler runtime calls. 104 setLibcallName(RTLIB::SDIV_I64, "_alldiv"); 105 setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); 106 setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2"); 107 setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2"); 108 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); 109 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); 110 setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C); 111 setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C); 112 } 113 114 if (Subtarget->isTargetDarwin()) { 115 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 116 setUseUnderscoreSetJmp(false); 117 setUseUnderscoreLongJmp(false); 118 } else if (Subtarget->isTargetMingw()) { 119 // MS runtime is weird: it exports _setjmp, but longjmp! 120 setUseUnderscoreSetJmp(true); 121 setUseUnderscoreLongJmp(false); 122 } else { 123 setUseUnderscoreSetJmp(true); 124 setUseUnderscoreLongJmp(true); 125 } 126 127 // Set up the register classes. 128 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 129 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 130 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 131 if (Subtarget->is64Bit()) 132 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 133 134 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 135 136 // We don't accept any truncstore of integer registers. 137 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 138 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 139 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 140 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 141 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 142 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 143 144 // SETOEQ and SETUNE require checking two conditions. 145 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 146 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 147 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 148 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 149 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 150 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 151 152 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 153 // operation. 154 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 155 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 156 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 157 158 if (Subtarget->is64Bit()) { 159 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 160 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 161 } else if (!UseSoftFloat) { 162 // We have an algorithm for SSE2->double, and we turn this into a 163 // 64-bit FILD followed by conditional FADD for other targets. 164 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 165 // We have an algorithm for SSE2, and we turn this into a 64-bit 166 // FILD for other targets. 167 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 168 } 169 170 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 171 // this operation. 
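  // Illustrative note (editorial addition, not in the original source):
  // "Promote" here means the operand is widened and the wider node is used
  // instead, e.g. an i8 SINT_TO_FP becomes a sign extension to i32 followed
  // by an i32 SINT_TO_FP (eventually cvtsi2ss/cvtsi2sd or fild), rather than
  // requiring an 8-bit conversion instruction that x86 does not have.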
172 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 173 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 174 175 if (!UseSoftFloat) { 176 // SSE has no i16 to fp conversion, only i32 177 if (X86ScalarSSEf32) { 178 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 179 // f32 and f64 cases are Legal, f80 case is not 180 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 181 } else { 182 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 183 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 184 } 185 } else { 186 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 187 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 188 } 189 190 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 191 // are Legal, f80 is custom lowered. 192 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 193 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 194 195 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 196 // this operation. 197 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 198 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 199 200 if (X86ScalarSSEf32) { 201 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 202 // f32 and f64 cases are Legal, f80 case is not 203 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 204 } else { 205 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 206 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 207 } 208 209 // Handle FP_TO_UINT by promoting the destination to a larger signed 210 // conversion. 211 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 212 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 213 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 214 215 if (Subtarget->is64Bit()) { 216 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 217 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 218 } else if (!UseSoftFloat) { 219 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 220 // Expand FP_TO_UINT into a select. 221 // FIXME: We would like to use a Custom expander here eventually to do 222 // the optimal thing for SSE vs. the default expansion in the legalizer. 223 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 224 else 225 // With SSE3 we can use fisttpll to convert to a signed i64; without 226 // SSE, we're stuck with a fistpll. 227 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 228 } 229 230 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 231 if (!X86ScalarSSEf64) { 232 setOperationAction(ISD::BITCAST , MVT::f32 , Expand); 233 setOperationAction(ISD::BITCAST , MVT::i32 , Expand); 234 if (Subtarget->is64Bit()) { 235 setOperationAction(ISD::BITCAST , MVT::f64 , Expand); 236 // Without SSE, i64->f64 goes through memory. 237 setOperationAction(ISD::BITCAST , MVT::i64 , Expand); 238 } 239 } 240 241 // Scalar integer divide and remainder are lowered to use operations that 242 // produce two results, to match the available instructions. This exposes 243 // the two-result form to trivial CSE, which is able to combine x/y and x%y 244 // into a single instruction. 245 // 246 // Scalar integer multiply-high is also lowered to use two-result 247 // operations, to match the available instructions. However, plain multiply 248 // (low) operations are left as Legal, as there are single-result 249 // instructions for this in x86. 
Using the two-result multiply instructions 250 // when both high and low results are needed must be arranged by dagcombine. 251 for (unsigned i = 0, e = 4; i != e; ++i) { 252 MVT VT = IntVTs[i]; 253 setOperationAction(ISD::MULHS, VT, Expand); 254 setOperationAction(ISD::MULHU, VT, Expand); 255 setOperationAction(ISD::SDIV, VT, Expand); 256 setOperationAction(ISD::UDIV, VT, Expand); 257 setOperationAction(ISD::SREM, VT, Expand); 258 setOperationAction(ISD::UREM, VT, Expand); 259 260 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 261 setOperationAction(ISD::ADDC, VT, Custom); 262 setOperationAction(ISD::ADDE, VT, Custom); 263 setOperationAction(ISD::SUBC, VT, Custom); 264 setOperationAction(ISD::SUBE, VT, Custom); 265 } 266 267 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 268 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 269 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 270 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 271 if (Subtarget->is64Bit()) 272 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 273 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 274 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 275 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 276 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 277 setOperationAction(ISD::FREM , MVT::f32 , Expand); 278 setOperationAction(ISD::FREM , MVT::f64 , Expand); 279 setOperationAction(ISD::FREM , MVT::f80 , Expand); 280 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 281 282 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 283 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 284 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 285 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 286 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 287 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 288 if (Subtarget->is64Bit()) { 289 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 290 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 291 } 292 293 if (Subtarget->hasPOPCNT()) { 294 setOperationAction(ISD::CTPOP , MVT::i8 , Promote); 295 } else { 296 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 297 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 298 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 299 if (Subtarget->is64Bit()) 300 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 301 } 302 303 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 304 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 305 306 // These should be promoted to a larger select which is supported. 307 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 308 // X86 wants to expand cmov itself. 
  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
  }
  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
  }

  if (Subtarget->hasXMM())
    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER so we should lower this.
  setOperationAction(ISD::MEMBARRIER      , MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
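  // Illustrative sketch (editorial addition, not in the original source):
  // with fence folding enabled, a DAG sequence such as
  //   MEMBARRIER ; ATOMIC_LOAD_ADD (i32) ; MEMBARRIER
  // can be selected to a single locked instruction (e.g. "lock addl" or
  // "lock xadd"), since the locked instruction already provides the
  // required ordering and the surrounding fences become redundant.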
364 setShouldFoldAtomicFences(true); 365 366 // Expand certain atomics 367 for (unsigned i = 0, e = 4; i != e; ++i) { 368 MVT VT = IntVTs[i]; 369 setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); 370 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); 371 } 372 373 if (!Subtarget->is64Bit()) { 374 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 375 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 376 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 377 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 378 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 379 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 380 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 381 } 382 383 // FIXME - use subtarget debug flags 384 if (!Subtarget->isTargetDarwin() && 385 !Subtarget->isTargetELF() && 386 !Subtarget->isTargetCygMing()) { 387 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 388 } 389 390 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 391 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 392 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 393 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 394 if (Subtarget->is64Bit()) { 395 setExceptionPointerRegister(X86::RAX); 396 setExceptionSelectorRegister(X86::RDX); 397 } else { 398 setExceptionPointerRegister(X86::EAX); 399 setExceptionSelectorRegister(X86::EDX); 400 } 401 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 402 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 403 404 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 405 406 setOperationAction(ISD::TRAP, MVT::Other, Legal); 407 408 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 409 setOperationAction(ISD::VASTART , MVT::Other, Custom); 410 setOperationAction(ISD::VAEND , MVT::Other, Expand); 411 if (Subtarget->is64Bit()) { 412 setOperationAction(ISD::VAARG , MVT::Other, Custom); 413 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 414 } else { 415 setOperationAction(ISD::VAARG , MVT::Other, Expand); 416 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 417 } 418 419 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 420 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 421 if (Subtarget->is64Bit()) 422 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 423 if (Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) 424 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 425 else 426 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 427 428 if (!UseSoftFloat && X86ScalarSSEf64) { 429 // f32 and f64 use SSE. 430 // Set up the FP register classes. 431 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 432 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 433 434 // Use ANDPD to simulate FABS. 435 setOperationAction(ISD::FABS , MVT::f64, Custom); 436 setOperationAction(ISD::FABS , MVT::f32, Custom); 437 438 // Use XORP to simulate FNEG. 439 setOperationAction(ISD::FNEG , MVT::f64, Custom); 440 setOperationAction(ISD::FNEG , MVT::f32, Custom); 441 442 // Use ANDPD and ORPD to simulate FCOPYSIGN. 
443 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 444 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 445 446 // We don't support sin/cos/fmod 447 setOperationAction(ISD::FSIN , MVT::f64, Expand); 448 setOperationAction(ISD::FCOS , MVT::f64, Expand); 449 setOperationAction(ISD::FSIN , MVT::f32, Expand); 450 setOperationAction(ISD::FCOS , MVT::f32, Expand); 451 452 // Expand FP immediates into loads from the stack, except for the special 453 // cases we handle. 454 addLegalFPImmediate(APFloat(+0.0)); // xorpd 455 addLegalFPImmediate(APFloat(+0.0f)); // xorps 456 } else if (!UseSoftFloat && X86ScalarSSEf32) { 457 // Use SSE for f32, x87 for f64. 458 // Set up the FP register classes. 459 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 460 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 461 462 // Use ANDPS to simulate FABS. 463 setOperationAction(ISD::FABS , MVT::f32, Custom); 464 465 // Use XORP to simulate FNEG. 466 setOperationAction(ISD::FNEG , MVT::f32, Custom); 467 468 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 469 470 // Use ANDPS and ORPS to simulate FCOPYSIGN. 471 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 472 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 473 474 // We don't support sin/cos/fmod 475 setOperationAction(ISD::FSIN , MVT::f32, Expand); 476 setOperationAction(ISD::FCOS , MVT::f32, Expand); 477 478 // Special cases we handle for FP constants. 479 addLegalFPImmediate(APFloat(+0.0f)); // xorps 480 addLegalFPImmediate(APFloat(+0.0)); // FLD0 481 addLegalFPImmediate(APFloat(+1.0)); // FLD1 482 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 483 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 484 485 if (!UnsafeFPMath) { 486 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 487 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 488 } 489 } else if (!UseSoftFloat) { 490 // f32 and f64 in x87. 491 // Set up the FP register classes. 492 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 493 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 494 495 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 496 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 497 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 498 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 499 500 if (!UnsafeFPMath) { 501 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 502 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 503 } 504 addLegalFPImmediate(APFloat(+0.0)); // FLD0 505 addLegalFPImmediate(APFloat(+1.0)); // FLD1 506 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 507 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 508 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 509 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 510 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 511 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 512 } 513 514 // Long double always uses X87. 
515 if (!UseSoftFloat) { 516 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 517 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 518 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 519 { 520 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); 521 addLegalFPImmediate(TmpFlt); // FLD0 522 TmpFlt.changeSign(); 523 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 524 525 bool ignored; 526 APFloat TmpFlt2(+1.0); 527 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 528 &ignored); 529 addLegalFPImmediate(TmpFlt2); // FLD1 530 TmpFlt2.changeSign(); 531 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 532 } 533 534 if (!UnsafeFPMath) { 535 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 536 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 537 } 538 } 539 540 // Always use a library call for pow. 541 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 542 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 543 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 544 545 setOperationAction(ISD::FLOG, MVT::f80, Expand); 546 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 547 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 548 setOperationAction(ISD::FEXP, MVT::f80, Expand); 549 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 550 551 // First set operation action for all vector types to either promote 552 // (for widening) or expand (for scalarization). Then we will selectively 553 // turn on ones that can be effectively codegen'd. 554 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 555 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 556 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 557 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 558 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 559 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 560 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 561 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 562 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 563 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 564 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 565 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 566 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 567 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 568 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 569 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 570 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 571 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 572 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 573 setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 574 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 575 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 576 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 577 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 578 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 579 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 580 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 581 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 582 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 583 
setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 584 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 585 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 586 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 587 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 588 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 589 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 590 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 591 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 592 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 593 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 594 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 595 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 596 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 597 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 598 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 599 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 600 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 601 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 602 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 603 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 604 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 605 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 606 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 607 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 608 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 609 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 610 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 611 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 612 setTruncStoreAction((MVT::SimpleValueType)VT, 613 (MVT::SimpleValueType)InnerVT, Expand); 614 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 615 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 616 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 617 } 618 619 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 620 // with -msoft-float, disable use of MMX as well. 621 if (!UseSoftFloat && Subtarget->hasMMX()) { 622 addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); 623 // No operations on x86mmx supported, everything uses intrinsics. 624 } 625 626 // MMX-sized vectors (other than x86mmx) are expected to be expanded 627 // into smaller operations. 
628 setOperationAction(ISD::MULHS, MVT::v8i8, Expand); 629 setOperationAction(ISD::MULHS, MVT::v4i16, Expand); 630 setOperationAction(ISD::MULHS, MVT::v2i32, Expand); 631 setOperationAction(ISD::MULHS, MVT::v1i64, Expand); 632 setOperationAction(ISD::AND, MVT::v8i8, Expand); 633 setOperationAction(ISD::AND, MVT::v4i16, Expand); 634 setOperationAction(ISD::AND, MVT::v2i32, Expand); 635 setOperationAction(ISD::AND, MVT::v1i64, Expand); 636 setOperationAction(ISD::OR, MVT::v8i8, Expand); 637 setOperationAction(ISD::OR, MVT::v4i16, Expand); 638 setOperationAction(ISD::OR, MVT::v2i32, Expand); 639 setOperationAction(ISD::OR, MVT::v1i64, Expand); 640 setOperationAction(ISD::XOR, MVT::v8i8, Expand); 641 setOperationAction(ISD::XOR, MVT::v4i16, Expand); 642 setOperationAction(ISD::XOR, MVT::v2i32, Expand); 643 setOperationAction(ISD::XOR, MVT::v1i64, Expand); 644 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); 645 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); 646 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); 647 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); 648 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); 649 setOperationAction(ISD::SELECT, MVT::v8i8, Expand); 650 setOperationAction(ISD::SELECT, MVT::v4i16, Expand); 651 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 652 setOperationAction(ISD::SELECT, MVT::v1i64, Expand); 653 setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); 654 setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); 655 setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); 656 setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); 657 658 if (!UseSoftFloat && Subtarget->hasXMM()) { 659 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 660 661 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 662 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 663 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 664 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 665 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 666 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 667 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 668 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 669 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 670 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 671 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 672 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 673 } 674 675 if (!UseSoftFloat && Subtarget->hasXMMInt()) { 676 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 677 678 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 679 // registers cannot be used even for integer operations. 
680 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 681 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 682 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 683 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 684 685 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 686 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 687 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 688 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 689 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 690 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 691 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 692 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 693 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 694 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 695 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 696 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 697 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 698 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 699 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 700 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 701 702 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 703 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 704 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 705 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 706 707 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 708 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 709 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 710 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 711 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 712 713 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 714 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 715 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 716 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 717 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 718 719 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 720 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 721 EVT VT = (MVT::SimpleValueType)i; 722 // Do not attempt to custom lower non-power-of-2 vectors 723 if (!isPowerOf2_32(VT.getVectorNumElements())) 724 continue; 725 // Do not attempt to custom lower non-128-bit vectors 726 if (!VT.is128BitVector()) 727 continue; 728 setOperationAction(ISD::BUILD_VECTOR, 729 VT.getSimpleVT().SimpleTy, Custom); 730 setOperationAction(ISD::VECTOR_SHUFFLE, 731 VT.getSimpleVT().SimpleTy, Custom); 732 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 733 VT.getSimpleVT().SimpleTy, Custom); 734 } 735 736 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 737 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 738 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 739 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 740 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 741 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 742 743 if (Subtarget->is64Bit()) { 744 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 745 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 746 } 747 748 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND,    SVT, Promote);
      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
      setOperationAction(ISD::OR,     SVT, Promote);
      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
      setOperationAction(ISD::XOR,    SVT, Promote);
      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD,   SVT, Promote);
      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD,   MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD,   MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR,     MVT::f32, Legal);
    setOperationAction(ISD::FCEIL,      MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC,     MVT::f32, Legal);
    setOperationAction(ISD::FRINT,      MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR,     MVT::f64, Legal);
    setOperationAction(ISD::FCEIL,      MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC,     MVT::f64, Legal);
    setOperationAction(ISD::FRINT,      MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width.  f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
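    // Illustrative sketch (editorial addition, not in the original source):
    // for the v4f32 case the SSE4.1 insertps immediate packs more than a
    // lane index, which is why a plain pattern is not enough. Roughly:
    //   imm[7:6] = source lane   (COUNT_S)
    //   imm[5:4] = destination lane (COUNT_D)
    //   imm[3:0] = zero mask for the remaining lanes
    // so the custom lowering has to construct that immediate itself.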
804 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 805 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 806 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 807 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 808 809 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 810 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 811 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 812 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 813 814 if (Subtarget->is64Bit()) { 815 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 816 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 817 } 818 } 819 820 if (Subtarget->hasSSE42()) 821 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 822 823 if (!UseSoftFloat && Subtarget->hasAVX()) { 824 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 825 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 826 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 827 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 828 addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); 829 830 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 831 setOperationAction(ISD::LOAD, MVT::v8i32, Legal); 832 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 833 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 834 835 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 836 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 837 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 838 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 839 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 840 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 841 842 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 843 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 844 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 845 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 846 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 847 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 848 849 // Custom lower build_vector, vector_shuffle, scalar_to_vector, 850 // insert_vector_elt extract_subvector and extract_vector_elt for 851 // 256-bit types. 852 for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 853 i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; 854 ++i) { 855 MVT::SimpleValueType VT = (MVT::SimpleValueType)i; 856 // Do not attempt to custom lower non-256-bit vectors 857 if (!isPowerOf2_32(MVT(VT).getVectorNumElements()) 858 || (MVT(VT).getSizeInBits() < 256)) 859 continue; 860 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 861 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 862 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 863 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 864 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 865 } 866 // Custom-lower insert_subvector and extract_subvector based on 867 // the result type. 868 for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 869 i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; 870 ++i) { 871 MVT::SimpleValueType VT = (MVT::SimpleValueType)i; 872 // Do not attempt to custom lower non-256-bit vectors 873 if (!isPowerOf2_32(MVT(VT).getVectorNumElements())) 874 continue; 875 876 if (MVT(VT).getSizeInBits() == 128) { 877 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 878 } 879 else if (MVT(VT).getSizeInBits() == 256) { 880 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 881 } 882 } 883 884 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 
885 // Don't promote loads because we need them for VPERM vector index versions. 886 887 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 888 VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; 889 VT++) { 890 if (!isPowerOf2_32(MVT((MVT::SimpleValueType)VT).getVectorNumElements()) 891 || (MVT((MVT::SimpleValueType)VT).getSizeInBits() < 256)) 892 continue; 893 setOperationAction(ISD::AND, (MVT::SimpleValueType)VT, Promote); 894 AddPromotedToType (ISD::AND, (MVT::SimpleValueType)VT, MVT::v4i64); 895 setOperationAction(ISD::OR, (MVT::SimpleValueType)VT, Promote); 896 AddPromotedToType (ISD::OR, (MVT::SimpleValueType)VT, MVT::v4i64); 897 setOperationAction(ISD::XOR, (MVT::SimpleValueType)VT, Promote); 898 AddPromotedToType (ISD::XOR, (MVT::SimpleValueType)VT, MVT::v4i64); 899 //setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Promote); 900 //AddPromotedToType (ISD::LOAD, (MVT::SimpleValueType)VT, MVT::v4i64); 901 setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote); 902 AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v4i64); 903 } 904 } 905 906 // We want to custom lower some of our intrinsics. 907 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 908 909 910 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 911 // handle type legalization for these operations here. 912 // 913 // FIXME: We really should do custom legalization for addition and 914 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 915 // than generic legalization for 64-bit multiplication-with-overflow, though. 916 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { 917 // Add/Sub/Mul with overflow operations are custom lowered. 918 MVT VT = IntVTs[i]; 919 setOperationAction(ISD::SADDO, VT, Custom); 920 setOperationAction(ISD::UADDO, VT, Custom); 921 setOperationAction(ISD::SSUBO, VT, Custom); 922 setOperationAction(ISD::USUBO, VT, Custom); 923 setOperationAction(ISD::SMULO, VT, Custom); 924 setOperationAction(ISD::UMULO, VT, Custom); 925 } 926 927 // There are no 8-bit 3-address imul/mul instructions 928 setOperationAction(ISD::SMULO, MVT::i8, Expand); 929 setOperationAction(ISD::UMULO, MVT::i8, Expand); 930 931 if (!Subtarget->is64Bit()) { 932 // These libcalls are not available in 32-bit. 933 setLibcallName(RTLIB::SHL_I128, 0); 934 setLibcallName(RTLIB::SRL_I128, 0); 935 setLibcallName(RTLIB::SRA_I128, 0); 936 } 937 938 // We have target-specific dag combine patterns for the following nodes: 939 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 940 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 941 setTargetDAGCombine(ISD::BUILD_VECTOR); 942 setTargetDAGCombine(ISD::SELECT); 943 setTargetDAGCombine(ISD::SHL); 944 setTargetDAGCombine(ISD::SRA); 945 setTargetDAGCombine(ISD::SRL); 946 setTargetDAGCombine(ISD::OR); 947 setTargetDAGCombine(ISD::AND); 948 setTargetDAGCombine(ISD::ADD); 949 setTargetDAGCombine(ISD::SUB); 950 setTargetDAGCombine(ISD::STORE); 951 setTargetDAGCombine(ISD::ZERO_EXTEND); 952 if (Subtarget->is64Bit()) 953 setTargetDAGCombine(ISD::MUL); 954 955 computeRegisterProperties(); 956 957 // On Darwin, -Os means optimize for size without hurting performance, 958 // do not reduce the limit. 959 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 960 maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; 961 maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 962 maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 
                               8 : 4;
  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasXMM())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, it is safe to assume the destination
/// alignment can satisfy any constraint. Similarly, if SrcAlign is zero,
/// the source alignment does not need to be checked, probably because the
/// source does not need to be loaded. If 'NonScalarIntSafe' is true, it is
/// safe to return a non-scalar-integer type, e.g. for an empty string
/// source, a constant, or a value loaded from memory. 'MemcpyStrSrc'
/// indicates whether the memcpy source is constant so it does not need to
/// be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
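  // Illustrative sketch (editorial addition, not in the original source):
  // unless -no-implicit-float is in effect, the checks below roughly mean
  //   size >= 16, 16-byte-aligned (or fast unaligned), SSE2   -> v4i32
  //   same, but only SSE1                                     -> v4f32
  //   size >= 8, 32-bit target with SSE2, non-string memcpy   -> f64
  //   otherwise                                               -> i64 / i32
  // e.g. a 32-byte memcpy with 16-byte-aligned operands on an SSE2 target
  // would be lowered with 128-bit loads and stores.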
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasXMMInt()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

// FIXME: Why is this routine here? Move to RegInfo!
1124std::pair<const TargetRegisterClass*, uint8_t> 1125X86TargetLowering::findRepresentativeClass(EVT VT) const{ 1126 const TargetRegisterClass *RRC = 0; 1127 uint8_t Cost = 1; 1128 switch (VT.getSimpleVT().SimpleTy) { 1129 default: 1130 return TargetLowering::findRepresentativeClass(VT); 1131 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1132 RRC = (Subtarget->is64Bit() 1133 ? X86::GR64RegisterClass : X86::GR32RegisterClass); 1134 break; 1135 case MVT::x86mmx: 1136 RRC = X86::VR64RegisterClass; 1137 break; 1138 case MVT::f32: case MVT::f64: 1139 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1140 case MVT::v4f32: case MVT::v2f64: 1141 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1142 case MVT::v4f64: 1143 RRC = X86::VR128RegisterClass; 1144 break; 1145 } 1146 return std::make_pair(RRC, Cost); 1147} 1148 1149// FIXME: Why this routine is here? Move to RegInfo! 1150unsigned 1151X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, 1152 MachineFunction &MF) const { 1153 const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); 1154 1155 unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; 1156 switch (RC->getID()) { 1157 default: 1158 return 0; 1159 case X86::GR32RegClassID: 1160 return 4 - FPDiff; 1161 case X86::GR64RegClassID: 1162 return 8 - FPDiff; 1163 case X86::VR128RegClassID: 1164 return Subtarget->is64Bit() ? 10 : 4; 1165 case X86::VR64RegClassID: 1166 return 4; 1167 } 1168} 1169 1170bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1171 unsigned &Offset) const { 1172 if (!Subtarget->isTargetLinux()) 1173 return false; 1174 1175 if (Subtarget->is64Bit()) { 1176 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1177 Offset = 0x28; 1178 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1179 AddressSpace = 256; 1180 else 1181 AddressSpace = 257; 1182 } else { 1183 // %gs:0x14 on i386 1184 Offset = 0x14; 1185 AddressSpace = 256; 1186 } 1187 return true; 1188} 1189 1190 1191//===----------------------------------------------------------------------===// 1192// Return Value Calling Convention Implementation 1193//===----------------------------------------------------------------------===// 1194 1195#include "X86GenCallingConv.inc" 1196 1197bool 1198X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, 1199 const SmallVectorImpl<ISD::OutputArg> &Outs, 1200 LLVMContext &Context) const { 1201 SmallVector<CCValAssign, 16> RVLocs; 1202 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1203 RVLocs, Context); 1204 return CCInfo.CheckReturn(Outs, RetCC_X86); 1205} 1206 1207SDValue 1208X86TargetLowering::LowerReturn(SDValue Chain, 1209 CallingConv::ID CallConv, bool isVarArg, 1210 const SmallVectorImpl<ISD::OutputArg> &Outs, 1211 const SmallVectorImpl<SDValue> &OutVals, 1212 DebugLoc dl, SelectionDAG &DAG) const { 1213 MachineFunction &MF = DAG.getMachineFunction(); 1214 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1215 1216 SmallVector<CCValAssign, 16> RVLocs; 1217 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1218 RVLocs, *DAG.getContext()); 1219 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1220 1221 // Add the regs to the liveout set for the function. 
1222 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1223 for (unsigned i = 0; i != RVLocs.size(); ++i) 1224 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1225 MRI.addLiveOut(RVLocs[i].getLocReg()); 1226 1227 SDValue Flag; 1228 1229 SmallVector<SDValue, 6> RetOps; 1230 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1231 // Operand #1 = Bytes To Pop 1232 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1233 MVT::i16)); 1234 1235 // Copy the result values into the output registers. 1236 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1237 CCValAssign &VA = RVLocs[i]; 1238 assert(VA.isRegLoc() && "Can only return in registers!"); 1239 SDValue ValToCopy = OutVals[i]; 1240 EVT ValVT = ValToCopy.getValueType(); 1241 1242 // If this is x86-64, and we disabled SSE, we can't return FP values, 1243 // or SSE or MMX vectors. 1244 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1245 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1246 (Subtarget->is64Bit() && !Subtarget->hasXMM())) { 1247 report_fatal_error("SSE register return with SSE disabled"); 1248 } 1249 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1250 // llvm-gcc has never done it right and no one has noticed, so this 1251 // should be OK for now. 1252 if (ValVT == MVT::f64 && 1253 (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) 1254 report_fatal_error("SSE2 register return with SSE2 disabled"); 1255 1256 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1257 // the RET instruction and handled by the FP Stackifier. 1258 if (VA.getLocReg() == X86::ST0 || 1259 VA.getLocReg() == X86::ST1) { 1260 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1261 // change the value to the FP stack register class. 1262 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1263 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1264 RetOps.push_back(ValToCopy); 1265 // Don't emit a copytoreg. 1266 continue; 1267 } 1268 1269 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1270 // which is returned in RAX / RDX. 1271 if (Subtarget->is64Bit()) { 1272 if (ValVT == MVT::x86mmx) { 1273 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1274 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1275 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1276 ValToCopy); 1277 // If we don't have SSE2 available, convert to v4f32 so the generated 1278 // register is legal. 1279 if (!Subtarget->hasSSE2()) 1280 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1281 } 1282 } 1283 } 1284 1285 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1286 Flag = Chain.getValue(1); 1287 } 1288 1289 // The x86-64 ABI for returning structs by value requires that we copy 1290 // the sret argument into %rax for the return. We saved the argument into 1291 // a virtual register in the entry block, so now we copy the value out 1292 // and into %rax. 
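  // Illustrative sketch (editorial addition, not in the original source):
  // under the x86-64 SysV ABI the caller passes the hidden sret pointer as
  // the first integer argument (%rdi) and expects that same address back in
  // %rax, e.g.:
  //   struct S f();     // S is returned via sret
  //   lea  rdi, [result]
  //   call f            // on return, %rax == &result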
1293 if (Subtarget->is64Bit() && 1294 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1295 MachineFunction &MF = DAG.getMachineFunction(); 1296 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1297 unsigned Reg = FuncInfo->getSRetReturnReg(); 1298 assert(Reg && 1299 "SRetReturnReg should have been set in LowerFormalArguments()."); 1300 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1301 1302 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1303 Flag = Chain.getValue(1); 1304 1305 // RAX now acts like a return value. 1306 MRI.addLiveOut(X86::RAX); 1307 } 1308 1309 RetOps[0] = Chain; // Update chain. 1310 1311 // Add the flag if we have it. 1312 if (Flag.getNode()) 1313 RetOps.push_back(Flag); 1314 1315 return DAG.getNode(X86ISD::RET_FLAG, dl, 1316 MVT::Other, &RetOps[0], RetOps.size()); 1317} 1318 1319bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { 1320 if (N->getNumValues() != 1) 1321 return false; 1322 if (!N->hasNUsesOfValue(1, 0)) 1323 return false; 1324 1325 SDNode *Copy = *N->use_begin(); 1326 if (Copy->getOpcode() != ISD::CopyToReg && 1327 Copy->getOpcode() != ISD::FP_EXTEND) 1328 return false; 1329 1330 bool HasRet = false; 1331 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1332 UI != UE; ++UI) { 1333 if (UI->getOpcode() != X86ISD::RET_FLAG) 1334 return false; 1335 HasRet = true; 1336 } 1337 1338 return HasRet; 1339} 1340 1341/// LowerCallResult - Lower the result values of a call into the 1342/// appropriate copies out of appropriate physical registers. 1343/// 1344SDValue 1345X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1346 CallingConv::ID CallConv, bool isVarArg, 1347 const SmallVectorImpl<ISD::InputArg> &Ins, 1348 DebugLoc dl, SelectionDAG &DAG, 1349 SmallVectorImpl<SDValue> &InVals) const { 1350 1351 // Assign locations to each value returned by this call. 1352 SmallVector<CCValAssign, 16> RVLocs; 1353 bool Is64Bit = Subtarget->is64Bit(); 1354 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1355 RVLocs, *DAG.getContext()); 1356 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1357 1358 // Copy all of the result registers out of their specified physreg. 1359 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1360 CCValAssign &VA = RVLocs[i]; 1361 EVT CopyVT = VA.getValVT(); 1362 1363 // If this is x86-64, and we disabled SSE, we can't return FP values 1364 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1365 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) { 1366 report_fatal_error("SSE register return with SSE disabled"); 1367 } 1368 1369 SDValue Val; 1370 1371 // If this is a call to a function that returns an fp value on the floating 1372 // point stack, we must guarantee the the value is popped from the stack, so 1373 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1374 // if the return value is not used. We use the FpGET_ST0 instructions 1375 // instead. 1376 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1377 // If we prefer to use the value in xmm registers, copy it out as f80 and 1378 // use a truncate to move it from fp stack reg to xmm reg. 1379 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1380 bool isST0 = VA.getLocReg() == X86::ST0; 1381 unsigned Opc = 0; 1382 if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32; 1383 if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64; 1384 if (CopyVT == MVT::f80) Opc = isST0 ? 
X86::FpGET_ST0_80:X86::FpGET_ST1_80; 1385 SDValue Ops[] = { Chain, InFlag }; 1386 Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Glue, 1387 Ops, 2), 1); 1388 Val = Chain.getValue(0); 1389 1390 // Round the f80 to the right size, which also moves it to the appropriate 1391 // xmm register. 1392 if (CopyVT != VA.getValVT()) 1393 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1394 // This truncation won't change the value. 1395 DAG.getIntPtrConstant(1)); 1396 } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1397 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1398 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1399 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1400 MVT::v2i64, InFlag).getValue(1); 1401 Val = Chain.getValue(0); 1402 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1403 Val, DAG.getConstant(0, MVT::i64)); 1404 } else { 1405 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1406 MVT::i64, InFlag).getValue(1); 1407 Val = Chain.getValue(0); 1408 } 1409 Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val); 1410 } else { 1411 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1412 CopyVT, InFlag).getValue(1); 1413 Val = Chain.getValue(0); 1414 } 1415 InFlag = Chain.getValue(2); 1416 InVals.push_back(Val); 1417 } 1418 1419 return Chain; 1420} 1421 1422 1423//===----------------------------------------------------------------------===// 1424// C & StdCall & Fast Calling Convention implementation 1425//===----------------------------------------------------------------------===// 1426// StdCall calling convention seems to be standard for many Windows' API 1427// routines and around. It differs from C calling convention just a little: 1428// callee should clean up the stack, not caller. Symbols should be also 1429// decorated in some fancy way :) It doesn't support any vector arguments. 1430// For info on fast calling convention see Fast Calling Convention (tail call) 1431// implementation LowerX86_32FastCCCallTo. 1432 1433/// CallIsStructReturn - Determines whether a call uses struct return 1434/// semantics. 1435static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1436 if (Outs.empty()) 1437 return false; 1438 1439 return Outs[0].Flags.isSRet(); 1440} 1441 1442/// ArgsAreStructReturn - Determines whether a function uses struct 1443/// return semantics. 1444static bool 1445ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1446 if (Ins.empty()) 1447 return false; 1448 1449 return Ins[0].Flags.isSRet(); 1450} 1451 1452/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1453/// by "Src" to address "Dst" with size and alignment information specified by 1454/// the specific parameter attribute. The copy will be passed as a byval 1455/// function parameter. 1456static SDValue 1457CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1458 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1459 DebugLoc dl) { 1460 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1461 1462 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1463 /*isVolatile*/false, /*AlwaysInline=*/true, 1464 MachinePointerInfo(), MachinePointerInfo()); 1465} 1466 1467/// IsTailCallConvention - Return true if the calling convention is one that 1468/// supports tail call optimization. 
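/// Currently that means the 'fast' and GHC conventions; calls using other
/// conventions can still be lowered as sibcalls when no ABI change is needed.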
1469static bool IsTailCallConvention(CallingConv::ID CC) { 1470 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1471} 1472 1473/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1474/// a tailcall target by changing its ABI. 1475static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1476 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1477} 1478 1479SDValue 1480X86TargetLowering::LowerMemArgument(SDValue Chain, 1481 CallingConv::ID CallConv, 1482 const SmallVectorImpl<ISD::InputArg> &Ins, 1483 DebugLoc dl, SelectionDAG &DAG, 1484 const CCValAssign &VA, 1485 MachineFrameInfo *MFI, 1486 unsigned i) const { 1487 // Create the nodes corresponding to a load from this parameter slot. 1488 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1489 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1490 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1491 EVT ValVT; 1492 1493 // If value is passed by pointer we have address passed instead of the value 1494 // itself. 1495 if (VA.getLocInfo() == CCValAssign::Indirect) 1496 ValVT = VA.getLocVT(); 1497 else 1498 ValVT = VA.getValVT(); 1499 1500 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1501 // changed with more analysis. 1502 // In case of tail call optimization mark all arguments mutable. Since they 1503 // could be overwritten by lowering of arguments in case of a tail call. 1504 if (Flags.isByVal()) { 1505 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1506 VA.getLocMemOffset(), isImmutable); 1507 return DAG.getFrameIndex(FI, getPointerTy()); 1508 } else { 1509 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1510 VA.getLocMemOffset(), isImmutable); 1511 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1512 return DAG.getLoad(ValVT, dl, Chain, FIN, 1513 MachinePointerInfo::getFixedStack(FI), 1514 false, false, 0); 1515 } 1516} 1517 1518SDValue 1519X86TargetLowering::LowerFormalArguments(SDValue Chain, 1520 CallingConv::ID CallConv, 1521 bool isVarArg, 1522 const SmallVectorImpl<ISD::InputArg> &Ins, 1523 DebugLoc dl, 1524 SelectionDAG &DAG, 1525 SmallVectorImpl<SDValue> &InVals) 1526 const { 1527 MachineFunction &MF = DAG.getMachineFunction(); 1528 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1529 1530 const Function* Fn = MF.getFunction(); 1531 if (Fn->hasExternalLinkage() && 1532 Subtarget->isTargetCygMing() && 1533 Fn->getName() == "main") 1534 FuncInfo->setForceFramePointer(true); 1535 1536 MachineFrameInfo *MFI = MF.getFrameInfo(); 1537 bool Is64Bit = Subtarget->is64Bit(); 1538 bool IsWin64 = Subtarget->isTargetWin64(); 1539 1540 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1541 "Var args not supported with calling convention fastcc or ghc"); 1542 1543 // Assign locations to all of the incoming arguments. 1544 SmallVector<CCValAssign, 16> ArgLocs; 1545 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1546 ArgLocs, *DAG.getContext()); 1547 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1548 1549 unsigned LastVal = ~0U; 1550 SDValue ArgValue; 1551 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1552 CCValAssign &VA = ArgLocs[i]; 1553 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1554 // places. 
1555 assert(VA.getValNo() != LastVal && 1556 "Don't support value assigned to multiple locs yet"); 1557 LastVal = VA.getValNo(); 1558 1559 if (VA.isRegLoc()) { 1560 EVT RegVT = VA.getLocVT(); 1561 TargetRegisterClass *RC = NULL; 1562 if (RegVT == MVT::i32) 1563 RC = X86::GR32RegisterClass; 1564 else if (Is64Bit && RegVT == MVT::i64) 1565 RC = X86::GR64RegisterClass; 1566 else if (RegVT == MVT::f32) 1567 RC = X86::FR32RegisterClass; 1568 else if (RegVT == MVT::f64) 1569 RC = X86::FR64RegisterClass; 1570 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1571 RC = X86::VR256RegisterClass; 1572 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1573 RC = X86::VR128RegisterClass; 1574 else if (RegVT == MVT::x86mmx) 1575 RC = X86::VR64RegisterClass; 1576 else 1577 llvm_unreachable("Unknown argument type!"); 1578 1579 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC, dl); 1580 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1581 1582 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1583 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1584 // right size. 1585 if (VA.getLocInfo() == CCValAssign::SExt) 1586 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1587 DAG.getValueType(VA.getValVT())); 1588 else if (VA.getLocInfo() == CCValAssign::ZExt) 1589 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1590 DAG.getValueType(VA.getValVT())); 1591 else if (VA.getLocInfo() == CCValAssign::BCvt) 1592 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1593 1594 if (VA.isExtInLoc()) { 1595 // Handle MMX values passed in XMM regs. 1596 if (RegVT.isVector()) { 1597 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1598 ArgValue); 1599 } else 1600 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1601 } 1602 } else { 1603 assert(VA.isMemLoc()); 1604 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1605 } 1606 1607 // If value is passed via pointer - do a load. 1608 if (VA.getLocInfo() == CCValAssign::Indirect) 1609 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1610 MachinePointerInfo(), false, false, 0); 1611 1612 InVals.push_back(ArgValue); 1613 } 1614 1615 // The x86-64 ABI for returning structs by value requires that we copy 1616 // the sret argument into %rax for the return. Save the argument into 1617 // a virtual register so that we can access it from the return points. 1618 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1619 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1620 unsigned Reg = FuncInfo->getSRetReturnReg(); 1621 if (!Reg) { 1622 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1623 FuncInfo->setSRetReturnReg(Reg); 1624 } 1625 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1626 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1627 } 1628 1629 unsigned StackSize = CCInfo.getNextStackOffset(); 1630 // Align stack specially for tail calls. 1631 if (FuncIsMadeTailCallSafe(CallConv)) 1632 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1633 1634 // If the function takes variable number of arguments, make a frame index for 1635 // the start of the first vararg value... for expansion of llvm.va_start. 
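  // On non-Win64 x86-64 this also sets up the register save area (up to
  // 6*8 + 8*16 = 176 bytes) together with the gp_offset/fp_offset values that
  // va_start hands out.  For example, for 'void f(int a, ...)' one GPR is
  // already taken by 'a', so VarArgsGPOffset starts at 8 and VarArgsFPOffset
  // at 48 (plus 16 for every XMM register consumed by fixed arguments); the
  // numbers simply restate the arithmetic below.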
1636 if (isVarArg) { 1637 if (!IsWin64 && (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1638 CallConv != CallingConv::X86_ThisCall))) { 1639 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1640 } 1641 if (Is64Bit) { 1642 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1643 1644 // FIXME: We should really autogenerate these arrays 1645 static const unsigned GPR64ArgRegsWin64[] = { 1646 X86::RCX, X86::RDX, X86::R8, X86::R9 1647 }; 1648 static const unsigned GPR64ArgRegs64Bit[] = { 1649 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1650 }; 1651 static const unsigned XMMArgRegs64Bit[] = { 1652 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1653 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1654 }; 1655 const unsigned *GPR64ArgRegs; 1656 unsigned NumXMMRegs = 0; 1657 1658 if (IsWin64) { 1659 // The XMM registers which might contain var arg parameters are shadowed 1660 // in their paired GPR. So we only need to save the GPR to their home 1661 // slots. 1662 TotalNumIntRegs = 4; 1663 GPR64ArgRegs = GPR64ArgRegsWin64; 1664 } else { 1665 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1666 GPR64ArgRegs = GPR64ArgRegs64Bit; 1667 1668 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); 1669 } 1670 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1671 TotalNumIntRegs); 1672 1673 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1674 assert(!(NumXMMRegs && !Subtarget->hasXMM()) && 1675 "SSE register cannot be used when SSE is disabled!"); 1676 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1677 "SSE register cannot be used when SSE is disabled!"); 1678 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) 1679 // Kernel mode asks for SSE to be disabled, so don't push them 1680 // on the stack. 1681 TotalNumXMMRegs = 0; 1682 1683 if (IsWin64) { 1684 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); 1685 // Get to the caller-allocated home save location. Add 8 to account 1686 // for the return address. 1687 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 1688 FuncInfo->setRegSaveFrameIndex( 1689 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1690 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1691 } else { 1692 // For X86-64, if there are vararg parameters that are passed via 1693 // registers, then we must store them to their spots on the stack so they 1694 // may be loaded by deferencing the result of va_next. 1695 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1696 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1697 FuncInfo->setRegSaveFrameIndex( 1698 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1699 false)); 1700 } 1701 1702 // Store the integer parameter registers. 
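    // Each GPR that was not consumed by a fixed argument is spilled to its
    // 8 byte slot in the register save area so va_arg can find it later.  For
    // example, on the non-Win64 path with two fixed integer arguments
    // (NumIntRegs == 2) the loop below stores RDX, RCX, R8 and R9 at offsets
    // 16, 24, 32 and 40 (illustrative values).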
1703 SmallVector<SDValue, 8> MemOps; 1704 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1705 getPointerTy()); 1706 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1707 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1708 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1709 DAG.getIntPtrConstant(Offset)); 1710 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1711 X86::GR64RegisterClass, dl); 1712 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1713 SDValue Store = 1714 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1715 MachinePointerInfo::getFixedStack( 1716 FuncInfo->getRegSaveFrameIndex(), Offset), 1717 false, false, 0); 1718 MemOps.push_back(Store); 1719 Offset += 8; 1720 } 1721 1722 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1723 // Now store the XMM (fp + vector) parameter registers. 1724 SmallVector<SDValue, 11> SaveXMMOps; 1725 SaveXMMOps.push_back(Chain); 1726 1727 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass, dl); 1728 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1729 SaveXMMOps.push_back(ALVal); 1730 1731 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1732 FuncInfo->getRegSaveFrameIndex())); 1733 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1734 FuncInfo->getVarArgsFPOffset())); 1735 1736 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1737 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 1738 X86::VR128RegisterClass, dl); 1739 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1740 SaveXMMOps.push_back(Val); 1741 } 1742 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1743 MVT::Other, 1744 &SaveXMMOps[0], SaveXMMOps.size())); 1745 } 1746 1747 if (!MemOps.empty()) 1748 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1749 &MemOps[0], MemOps.size()); 1750 } 1751 } 1752 1753 // Some CCs need callee pop. 1754 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1755 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1756 } else { 1757 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1758 // If this is an sret function, the return should pop the hidden pointer. 1759 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1760 FuncInfo->setBytesToPopOnReturn(4); 1761 } 1762 1763 if (!Is64Bit) { 1764 // RegSaveFrameIndex is X86-64 only. 1765 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1766 if (CallConv == CallingConv::X86_FastCall || 1767 CallConv == CallingConv::X86_ThisCall) 1768 // fastcc functions can't have varargs. 1769 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1770 } 1771 1772 return Chain; 1773} 1774 1775SDValue 1776X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1777 SDValue StackPtr, SDValue Arg, 1778 DebugLoc dl, SelectionDAG &DAG, 1779 const CCValAssign &VA, 1780 ISD::ArgFlagsTy Flags) const { 1781 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1782 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1783 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1784 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1785 if (Flags.isByVal()) 1786 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1787 1788 return DAG.getStore(Chain, dl, Arg, PtrOff, 1789 MachinePointerInfo::getStack(LocMemOffset), 1790 false, false, 0); 1791} 1792 1793/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1794/// optimization is performed and it is required. 
1795SDValue 1796X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1797 SDValue &OutRetAddr, SDValue Chain, 1798 bool IsTailCall, bool Is64Bit, 1799 int FPDiff, DebugLoc dl) const { 1800 // Adjust the Return address stack slot. 1801 EVT VT = getPointerTy(); 1802 OutRetAddr = getReturnAddressFrameIndex(DAG); 1803 1804 // Load the "old" Return address. 1805 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 1806 false, false, 0); 1807 return SDValue(OutRetAddr.getNode(), 1); 1808} 1809 1810/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1811/// optimization is performed and it is required (FPDiff!=0). 1812static SDValue 1813EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1814 SDValue Chain, SDValue RetAddrFrIdx, 1815 bool Is64Bit, int FPDiff, DebugLoc dl) { 1816 // Store the return address to the appropriate stack slot. 1817 if (!FPDiff) return Chain; 1818 // Calculate the new stack slot for the return address. 1819 int SlotSize = Is64Bit ? 8 : 4; 1820 int NewReturnAddrFI = 1821 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1822 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1823 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1824 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1825 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 1826 false, false, 0); 1827 return Chain; 1828} 1829 1830SDValue 1831X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1832 CallingConv::ID CallConv, bool isVarArg, 1833 bool &isTailCall, 1834 const SmallVectorImpl<ISD::OutputArg> &Outs, 1835 const SmallVectorImpl<SDValue> &OutVals, 1836 const SmallVectorImpl<ISD::InputArg> &Ins, 1837 DebugLoc dl, SelectionDAG &DAG, 1838 SmallVectorImpl<SDValue> &InVals) const { 1839 MachineFunction &MF = DAG.getMachineFunction(); 1840 bool Is64Bit = Subtarget->is64Bit(); 1841 bool IsStructRet = CallIsStructReturn(Outs); 1842 bool IsSibcall = false; 1843 1844 if (isTailCall) { 1845 // Check if it's really possible to do a tail call. 1846 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1847 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1848 Outs, OutVals, Ins, DAG); 1849 1850 // Sibcalls are automatically detected tailcalls which do not require 1851 // ABI changes. 1852 if (!GuaranteedTailCallOpt && isTailCall) 1853 IsSibcall = true; 1854 1855 if (isTailCall) 1856 ++NumTailCalls; 1857 } 1858 1859 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1860 "Var args not supported with calling convention fastcc or ghc"); 1861 1862 // Analyze operands of the call, assigning locations to each operand. 1863 SmallVector<CCValAssign, 16> ArgLocs; 1864 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1865 ArgLocs, *DAG.getContext()); 1866 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 1867 1868 // Get a count of how many bytes are to be pushed on the stack. 1869 unsigned NumBytes = CCInfo.getNextStackOffset(); 1870 if (IsSibcall) 1871 // This is a sibcall. The memory operands are available in caller's 1872 // own caller's stack. 1873 NumBytes = 0; 1874 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1875 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1876 1877 int FPDiff = 0; 1878 if (isTailCall && !IsSibcall) { 1879 // Lower arguments at fp - stackoffset + fpdiff. 
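    // FPDiff measures how much this call's argument area outgrows what the
    // caller will pop on return.  For instance, if the caller pops 8 bytes but
    // the callee needs 24 bytes of arguments, FPDiff is 8 - 24 = -16 and the
    // return address has to move 16 bytes further down to make room
    // (illustrative numbers).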
1880     unsigned NumBytesCallerPushed =
1881       MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1882     FPDiff = NumBytesCallerPushed - NumBytes;
1883 
1884     // Record the delta by which the return address slot must move, but only
1885     // if this call needs it moved further (a more negative delta) than before.
1886     if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1887       MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1888   }
1889 
1890   if (!IsSibcall)
1891     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1892 
1893   SDValue RetAddrFrIdx;
1894   // Load return address for tail calls.
1895   if (isTailCall && FPDiff)
1896     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
1897                                     Is64Bit, FPDiff, dl);
1898 
1899   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1900   SmallVector<SDValue, 8> MemOpChains;
1901   SDValue StackPtr;
1902 
1903   // Walk the register/memloc assignments, inserting copies/loads.  In the case
1904   // of tail call optimization arguments are handled later.
1905   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1906     CCValAssign &VA = ArgLocs[i];
1907     EVT RegVT = VA.getLocVT();
1908     SDValue Arg = OutVals[i];
1909     ISD::ArgFlagsTy Flags = Outs[i].Flags;
1910     bool isByVal = Flags.isByVal();
1911 
1912     // Promote the value if needed.
1913     switch (VA.getLocInfo()) {
1914     default: llvm_unreachable("Unknown loc info!");
1915     case CCValAssign::Full: break;
1916     case CCValAssign::SExt:
1917       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1918       break;
1919     case CCValAssign::ZExt:
1920       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1921       break;
1922     case CCValAssign::AExt:
1923       if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1924         // Special case: passing MMX values in XMM registers.
1925         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
1926         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1927         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1928       } else
1929         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1930       break;
1931     case CCValAssign::BCvt:
1932       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
1933       break;
1934     case CCValAssign::Indirect: {
1935       // Store the argument.
1936       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
1937       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1938       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
1939                            MachinePointerInfo::getFixedStack(FI),
1940                            false, false, 0);
1941       Arg = SpillSlot;
1942       break;
1943     }
1944     }
1945 
1946     if (VA.isRegLoc()) {
1947       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1948       if (isVarArg && Subtarget->isTargetWin64()) {
1949         // Win64 ABI requires argument XMM reg to be copied to the corresponding
1950         // shadow reg if callee is a varargs function.
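        // For example, a double passed in XMM1 is also copied into RDX, so a
        // varargs callee that only walks the GPR home area still sees it (the
        // pairing matches the switch below).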
1951 unsigned ShadowReg = 0; 1952 switch (VA.getLocReg()) { 1953 case X86::XMM0: ShadowReg = X86::RCX; break; 1954 case X86::XMM1: ShadowReg = X86::RDX; break; 1955 case X86::XMM2: ShadowReg = X86::R8; break; 1956 case X86::XMM3: ShadowReg = X86::R9; break; 1957 } 1958 if (ShadowReg) 1959 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 1960 } 1961 } else if (!IsSibcall && (!isTailCall || isByVal)) { 1962 assert(VA.isMemLoc()); 1963 if (StackPtr.getNode() == 0) 1964 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1965 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1966 dl, DAG, VA, Flags)); 1967 } 1968 } 1969 1970 if (!MemOpChains.empty()) 1971 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1972 &MemOpChains[0], MemOpChains.size()); 1973 1974 // Build a sequence of copy-to-reg nodes chained together with token chain 1975 // and flag operands which copy the outgoing args into registers. 1976 SDValue InFlag; 1977 // Tail call byval lowering might overwrite argument registers so in case of 1978 // tail call optimization the copies to registers are lowered later. 1979 if (!isTailCall) 1980 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1981 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1982 RegsToPass[i].second, InFlag); 1983 InFlag = Chain.getValue(1); 1984 } 1985 1986 if (Subtarget->isPICStyleGOT()) { 1987 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1988 // GOT pointer. 1989 if (!isTailCall) { 1990 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1991 DAG.getNode(X86ISD::GlobalBaseReg, 1992 DebugLoc(), getPointerTy()), 1993 InFlag); 1994 InFlag = Chain.getValue(1); 1995 } else { 1996 // If we are tail calling and generating PIC/GOT style code load the 1997 // address of the callee into ECX. The value in ecx is used as target of 1998 // the tail jump. This is done to circumvent the ebx/callee-saved problem 1999 // for tail calls on PIC/GOT architectures. Normally we would just put the 2000 // address of GOT into ebx and then call target@PLT. But for tail calls 2001 // ebx would be restored (since ebx is callee saved) before jumping to the 2002 // target@PLT. 2003 2004 // Note: The actual moving to ECX is done further down. 2005 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2006 if (G && !G->getGlobal()->hasHiddenVisibility() && 2007 !G->getGlobal()->hasProtectedVisibility()) 2008 Callee = LowerGlobalAddress(Callee, DAG); 2009 else if (isa<ExternalSymbolSDNode>(Callee)) 2010 Callee = LowerExternalSymbol(Callee, DAG); 2011 } 2012 } 2013 2014 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) { 2015 // From AMD64 ABI document: 2016 // For calls that may call functions that use varargs or stdargs 2017 // (prototype-less calls or calls to functions containing ellipsis (...) in 2018 // the declaration) %al is used as hidden argument to specify the number 2019 // of SSE registers used. The contents of %al do not need to match exactly 2020 // the number of registers, but must be an ubound on the number of SSE 2021 // registers used and is in the range 0 - 8 inclusive. 2022 2023 // Count the number of XMM registers allocated. 
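    // For example, a call like printf("%f\n", x) puts x in XMM0, so %al gets
    // the value 1; any upper bound on the count (up to 8) would also satisfy
    // the ABI.  The call is illustrative only.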
2024 static const unsigned XMMArgRegs[] = { 2025 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2026 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2027 }; 2028 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2029 assert((Subtarget->hasXMM() || !NumXMMRegs) 2030 && "SSE registers cannot be used when SSE is disabled"); 2031 2032 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2033 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2034 InFlag = Chain.getValue(1); 2035 } 2036 2037 2038 // For tail calls lower the arguments to the 'real' stack slot. 2039 if (isTailCall) { 2040 // Force all the incoming stack arguments to be loaded from the stack 2041 // before any new outgoing arguments are stored to the stack, because the 2042 // outgoing stack slots may alias the incoming argument stack slots, and 2043 // the alias isn't otherwise explicit. This is slightly more conservative 2044 // than necessary, because it means that each store effectively depends 2045 // on every argument instead of just those arguments it would clobber. 2046 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2047 2048 SmallVector<SDValue, 8> MemOpChains2; 2049 SDValue FIN; 2050 int FI = 0; 2051 // Do not flag preceeding copytoreg stuff together with the following stuff. 2052 InFlag = SDValue(); 2053 if (GuaranteedTailCallOpt) { 2054 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2055 CCValAssign &VA = ArgLocs[i]; 2056 if (VA.isRegLoc()) 2057 continue; 2058 assert(VA.isMemLoc()); 2059 SDValue Arg = OutVals[i]; 2060 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2061 // Create frame index. 2062 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2063 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2064 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2065 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2066 2067 if (Flags.isByVal()) { 2068 // Copy relative to framepointer. 2069 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2070 if (StackPtr.getNode() == 0) 2071 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2072 getPointerTy()); 2073 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2074 2075 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2076 ArgChain, 2077 Flags, DAG, dl)); 2078 } else { 2079 // Store relative to framepointer. 2080 MemOpChains2.push_back( 2081 DAG.getStore(ArgChain, dl, Arg, FIN, 2082 MachinePointerInfo::getFixedStack(FI), 2083 false, false, 0)); 2084 } 2085 } 2086 } 2087 2088 if (!MemOpChains2.empty()) 2089 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2090 &MemOpChains2[0], MemOpChains2.size()); 2091 2092 // Copy arguments to their registers. 2093 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2094 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2095 RegsToPass[i].second, InFlag); 2096 InFlag = Chain.getValue(1); 2097 } 2098 InFlag =SDValue(); 2099 2100 // Store the return address to the appropriate stack slot. 2101 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2102 FPDiff, dl); 2103 } 2104 2105 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2106 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2107 // In the 64-bit large code model, we have to make all calls 2108 // through a register, since the call instruction's 32-bit 2109 // pc-relative offset may not be large enough to hold the whole 2110 // address. 
2111 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2112 // If the callee is a GlobalAddress node (quite common, every direct call 2113 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2114 // it. 2115 2116 // We should use extra load for direct calls to dllimported functions in 2117 // non-JIT mode. 2118 const GlobalValue *GV = G->getGlobal(); 2119 if (!GV->hasDLLImportLinkage()) { 2120 unsigned char OpFlags = 0; 2121 2122 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2123 // external symbols most go through the PLT in PIC mode. If the symbol 2124 // has hidden or protected visibility, or if it is static or local, then 2125 // we don't need to use the PLT - we can directly call it. 2126 if (Subtarget->isTargetELF() && 2127 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2128 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2129 OpFlags = X86II::MO_PLT; 2130 } else if (Subtarget->isPICStyleStubAny() && 2131 (GV->isDeclaration() || GV->isWeakForLinker()) && 2132 Subtarget->getDarwinVers() < 9) { 2133 // PC-relative references to external symbols should go through $stub, 2134 // unless we're building with the leopard linker or later, which 2135 // automatically synthesizes these stubs. 2136 OpFlags = X86II::MO_DARWIN_STUB; 2137 } 2138 2139 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2140 G->getOffset(), OpFlags); 2141 } 2142 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2143 unsigned char OpFlags = 0; 2144 2145 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2146 // external symbols should go through the PLT. 2147 if (Subtarget->isTargetELF() && 2148 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2149 OpFlags = X86II::MO_PLT; 2150 } else if (Subtarget->isPICStyleStubAny() && 2151 Subtarget->getDarwinVers() < 9) { 2152 // PC-relative references to external symbols should go through $stub, 2153 // unless we're building with the leopard linker or later, which 2154 // automatically synthesizes these stubs. 2155 OpFlags = X86II::MO_DARWIN_STUB; 2156 } 2157 2158 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2159 OpFlags); 2160 } 2161 2162 // Returns a chain & a flag for retval copy to use. 2163 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2164 SmallVector<SDValue, 8> Ops; 2165 2166 if (!IsSibcall && isTailCall) { 2167 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2168 DAG.getIntPtrConstant(0, true), InFlag); 2169 InFlag = Chain.getValue(1); 2170 } 2171 2172 Ops.push_back(Chain); 2173 Ops.push_back(Callee); 2174 2175 if (isTailCall) 2176 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2177 2178 // Add argument registers to the end of the list so that they are known live 2179 // into the call. 2180 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2181 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2182 RegsToPass[i].second.getValueType())); 2183 2184 // Add an implicit use GOT pointer in EBX. 2185 if (!isTailCall && Subtarget->isPICStyleGOT()) 2186 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2187 2188 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 
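  // (AL was loaded above with the XMM register count; making it an operand of
  // the call keeps that copy live into the call.)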
2189   if (Is64Bit && isVarArg && !Subtarget->isTargetWin64())
2190     Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2191 
2192   if (InFlag.getNode())
2193     Ops.push_back(InFlag);
2194 
2195   if (isTailCall) {
2196     // We used to do:
2197     //// If this is the first return lowered for this function, add the regs
2198     //// to the liveout set for the function.
2199     // This isn't right, although it's probably harmless on x86; liveouts
2200     // should be computed from returns not tail calls.  Consider a void
2201     // function making a tail call to a function returning int.
2202     return DAG.getNode(X86ISD::TC_RETURN, dl,
2203                        NodeTys, &Ops[0], Ops.size());
2204   }
2205 
2206   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2207   InFlag = Chain.getValue(1);
2208 
2209   // Create the CALLSEQ_END node.
2210   unsigned NumBytesForCalleeToPush;
2211   if (Subtarget->IsCalleePop(isVarArg, CallConv))
2212     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2213   else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2214     // If this is a call to a struct-return function, the callee
2215     // pops the hidden struct pointer, so we have to push it back.
2216     // This is common for Darwin/X86, Linux & Mingw32 targets.
2217     NumBytesForCalleeToPush = 4;
2218   else
2219     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2220 
2221   // Returns a flag for retval copy to use.
2222   if (!IsSibcall) {
2223     Chain = DAG.getCALLSEQ_END(Chain,
2224                                DAG.getIntPtrConstant(NumBytes, true),
2225                                DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2226                                                      true),
2227                                InFlag);
2228     InFlag = Chain.getValue(1);
2229   }
2230 
2231   // Handle result values, copying them out of physregs into vregs that we
2232   // return.
2233   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2234                          Ins, dl, DAG, InVals);
2235 }
2236 
2237 
2238 //===----------------------------------------------------------------------===//
2239 //                Fast Calling Convention (tail call) implementation
2240 //===----------------------------------------------------------------------===//
2241 
2242 //  Like stdcall, the callee cleans up the arguments, except that ECX is
2243 //  reserved for storing the tail called function address. Only 2 registers are
2244 //  free for argument passing (inreg). Tail call optimization is performed
2245 //  provided:
2246 //                * tailcallopt is enabled
2247 //                * caller/callee are fastcc
2248 //  On X86_64 architecture with GOT-style position independent code only local
2249 //  (within module) calls are supported at the moment.
2250 //  To keep the stack aligned according to the platform ABI, the function
2251 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
2252 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
2253 //  If the callee of a tail call has more arguments than the caller, the caller
2254 //  needs to make sure that there is room to move the RETADDR to. This is
2255 //  achieved by reserving an area the size of the argument delta right after the
2256 //  original RETADDR, but before the saved framepointer or the spilled registers
2257 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2258 //  stack layout:
2259 //    arg1
2260 //    arg2
2261 //    RETADDR
2262 //    [ new RETADDR
2263 //      move area ]
2264 //    (possible EBP)
2265 //    ESI
2266 //    EDI
2267 //    local1 ..
2268 
2269 /// GetAlignedArgumentStackSize - Round StackSize up so the argument area keeps
2270 /// the stack aligned, e.g. to the form 16*n + 12 for a 16 byte requirement.
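/// For example, with a 16 byte stack alignment and 4 byte slots, a StackSize
/// of 20 becomes 28 and a StackSize of 30 becomes 44 (both of the form
/// 16*n + 12), so that pushing the 4 byte return address leaves the stack
/// 16 byte aligned at the callee's entry; the numbers just restate the
/// arithmetic below.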
2271unsigned 2272X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2273 SelectionDAG& DAG) const { 2274 MachineFunction &MF = DAG.getMachineFunction(); 2275 const TargetMachine &TM = MF.getTarget(); 2276 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 2277 unsigned StackAlignment = TFI.getStackAlignment(); 2278 uint64_t AlignMask = StackAlignment - 1; 2279 int64_t Offset = StackSize; 2280 uint64_t SlotSize = TD->getPointerSize(); 2281 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2282 // Number smaller than 12 so just add the difference. 2283 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2284 } else { 2285 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2286 Offset = ((~AlignMask) & Offset) + StackAlignment + 2287 (StackAlignment-SlotSize); 2288 } 2289 return Offset; 2290} 2291 2292/// MatchingStackOffset - Return true if the given stack call argument is 2293/// already available in the same position (relatively) of the caller's 2294/// incoming argument stack. 2295static 2296bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2297 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2298 const X86InstrInfo *TII) { 2299 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2300 int FI = INT_MAX; 2301 if (Arg.getOpcode() == ISD::CopyFromReg) { 2302 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2303 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2304 return false; 2305 MachineInstr *Def = MRI->getVRegDef(VR); 2306 if (!Def) 2307 return false; 2308 if (!Flags.isByVal()) { 2309 if (!TII->isLoadFromStackSlot(Def, FI)) 2310 return false; 2311 } else { 2312 unsigned Opcode = Def->getOpcode(); 2313 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2314 Def->getOperand(1).isFI()) { 2315 FI = Def->getOperand(1).getIndex(); 2316 Bytes = Flags.getByValSize(); 2317 } else 2318 return false; 2319 } 2320 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2321 if (Flags.isByVal()) 2322 // ByVal argument is passed in as a pointer but it's now being 2323 // dereferenced. e.g. 2324 // define @foo(%struct.X* %A) { 2325 // tail call @bar(%struct.X* byval %A) 2326 // } 2327 return false; 2328 SDValue Ptr = Ld->getBasePtr(); 2329 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2330 if (!FINode) 2331 return false; 2332 FI = FINode->getIndex(); 2333 } else 2334 return false; 2335 2336 assert(FI != INT_MAX); 2337 if (!MFI->isFixedObjectIndex(FI)) 2338 return false; 2339 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2340} 2341 2342/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2343/// for tail call optimization. Targets which want to do tail call 2344/// optimization should implement this function. 2345bool 2346X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2347 CallingConv::ID CalleeCC, 2348 bool isVarArg, 2349 bool isCalleeStructRet, 2350 bool isCallerStructRet, 2351 const SmallVectorImpl<ISD::OutputArg> &Outs, 2352 const SmallVectorImpl<SDValue> &OutVals, 2353 const SmallVectorImpl<ISD::InputArg> &Ins, 2354 SelectionDAG& DAG) const { 2355 if (!IsTailCallConvention(CalleeCC) && 2356 CalleeCC != CallingConv::C) 2357 return false; 2358 2359 // If -tailcallopt is specified, make fastcc functions tail-callable. 
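  // (With GuaranteedTailCallOpt the decision is purely by convention: matching
  // fastcc/GHC caller-callee pairs are accepted and everything else is
  // rejected; without it we fall through to the sibcall checks below.)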
2360 const MachineFunction &MF = DAG.getMachineFunction(); 2361 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2362 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2363 bool CCMatch = CallerCC == CalleeCC; 2364 2365 if (GuaranteedTailCallOpt) { 2366 if (IsTailCallConvention(CalleeCC) && CCMatch) 2367 return true; 2368 return false; 2369 } 2370 2371 // Look for obvious safe cases to perform tail call optimization that do not 2372 // require ABI changes. This is what gcc calls sibcall. 2373 2374 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2375 // emit a special epilogue. 2376 if (RegInfo->needsStackRealignment(MF)) 2377 return false; 2378 2379 // Do not sibcall optimize vararg calls unless the call site is not passing 2380 // any arguments. 2381 if (isVarArg && !Outs.empty()) 2382 return false; 2383 2384 // Also avoid sibcall optimization if either caller or callee uses struct 2385 // return semantics. 2386 if (isCalleeStructRet || isCallerStructRet) 2387 return false; 2388 2389 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2390 // Therefore if it's not used by the call it is not safe to optimize this into 2391 // a sibcall. 2392 bool Unused = false; 2393 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2394 if (!Ins[i].Used) { 2395 Unused = true; 2396 break; 2397 } 2398 } 2399 if (Unused) { 2400 SmallVector<CCValAssign, 16> RVLocs; 2401 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2402 RVLocs, *DAG.getContext()); 2403 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2404 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2405 CCValAssign &VA = RVLocs[i]; 2406 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2407 return false; 2408 } 2409 } 2410 2411 // If the calling conventions do not match, then we'd better make sure the 2412 // results are returned in the same way as what the caller expects. 2413 if (!CCMatch) { 2414 SmallVector<CCValAssign, 16> RVLocs1; 2415 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2416 RVLocs1, *DAG.getContext()); 2417 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2418 2419 SmallVector<CCValAssign, 16> RVLocs2; 2420 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2421 RVLocs2, *DAG.getContext()); 2422 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2423 2424 if (RVLocs1.size() != RVLocs2.size()) 2425 return false; 2426 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2427 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2428 return false; 2429 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2430 return false; 2431 if (RVLocs1[i].isRegLoc()) { 2432 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2433 return false; 2434 } else { 2435 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2436 return false; 2437 } 2438 } 2439 } 2440 2441 // If the callee takes no arguments then go on to check the results of the 2442 // call. 2443 if (!Outs.empty()) { 2444 // Check if stack adjustment is needed. For now, do not do this if any 2445 // argument is passed on the stack. 
2446 SmallVector<CCValAssign, 16> ArgLocs; 2447 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2448 ArgLocs, *DAG.getContext()); 2449 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2450 if (CCInfo.getNextStackOffset()) { 2451 MachineFunction &MF = DAG.getMachineFunction(); 2452 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2453 return false; 2454 2455 // Check if the arguments are already laid out in the right way as 2456 // the caller's fixed stack objects. 2457 MachineFrameInfo *MFI = MF.getFrameInfo(); 2458 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2459 const X86InstrInfo *TII = 2460 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2461 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2462 CCValAssign &VA = ArgLocs[i]; 2463 SDValue Arg = OutVals[i]; 2464 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2465 if (VA.getLocInfo() == CCValAssign::Indirect) 2466 return false; 2467 if (!VA.isRegLoc()) { 2468 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2469 MFI, MRI, TII)) 2470 return false; 2471 } 2472 } 2473 } 2474 2475 // If the tailcall address may be in a register, then make sure it's 2476 // possible to register allocate for it. In 32-bit, the call address can 2477 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2478 // callee-saved registers are restored. These happen to be the same 2479 // registers used to pass 'inreg' arguments so watch out for those. 2480 if (!Subtarget->is64Bit() && 2481 !isa<GlobalAddressSDNode>(Callee) && 2482 !isa<ExternalSymbolSDNode>(Callee)) { 2483 unsigned NumInRegs = 0; 2484 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2485 CCValAssign &VA = ArgLocs[i]; 2486 if (!VA.isRegLoc()) 2487 continue; 2488 unsigned Reg = VA.getLocReg(); 2489 switch (Reg) { 2490 default: break; 2491 case X86::EAX: case X86::EDX: case X86::ECX: 2492 if (++NumInRegs == 3) 2493 return false; 2494 break; 2495 } 2496 } 2497 } 2498 } 2499 2500 // An stdcall caller is expected to clean up its arguments; the callee 2501 // isn't going to do that. 
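  // For example, an stdcall function tail-calling a cdecl one would return
  // with its own arguments still on the stack and nobody left to pop them, so
  // such mismatches are rejected here.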
2502 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2503 return false; 2504 2505 return true; 2506} 2507 2508FastISel * 2509X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2510 return X86::createFastISel(funcInfo); 2511} 2512 2513 2514//===----------------------------------------------------------------------===// 2515// Other Lowering Hooks 2516//===----------------------------------------------------------------------===// 2517 2518static bool MayFoldLoad(SDValue Op) { 2519 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2520} 2521 2522static bool MayFoldIntoStore(SDValue Op) { 2523 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2524} 2525 2526static bool isTargetShuffle(unsigned Opcode) { 2527 switch(Opcode) { 2528 default: return false; 2529 case X86ISD::PSHUFD: 2530 case X86ISD::PSHUFHW: 2531 case X86ISD::PSHUFLW: 2532 case X86ISD::SHUFPD: 2533 case X86ISD::PALIGN: 2534 case X86ISD::SHUFPS: 2535 case X86ISD::MOVLHPS: 2536 case X86ISD::MOVLHPD: 2537 case X86ISD::MOVHLPS: 2538 case X86ISD::MOVLPS: 2539 case X86ISD::MOVLPD: 2540 case X86ISD::MOVSHDUP: 2541 case X86ISD::MOVSLDUP: 2542 case X86ISD::MOVDDUP: 2543 case X86ISD::MOVSS: 2544 case X86ISD::MOVSD: 2545 case X86ISD::UNPCKLPS: 2546 case X86ISD::UNPCKLPD: 2547 case X86ISD::PUNPCKLWD: 2548 case X86ISD::PUNPCKLBW: 2549 case X86ISD::PUNPCKLDQ: 2550 case X86ISD::PUNPCKLQDQ: 2551 case X86ISD::UNPCKHPS: 2552 case X86ISD::UNPCKHPD: 2553 case X86ISD::PUNPCKHWD: 2554 case X86ISD::PUNPCKHBW: 2555 case X86ISD::PUNPCKHDQ: 2556 case X86ISD::PUNPCKHQDQ: 2557 return true; 2558 } 2559 return false; 2560} 2561 2562static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2563 SDValue V1, SelectionDAG &DAG) { 2564 switch(Opc) { 2565 default: llvm_unreachable("Unknown x86 shuffle node"); 2566 case X86ISD::MOVSHDUP: 2567 case X86ISD::MOVSLDUP: 2568 case X86ISD::MOVDDUP: 2569 return DAG.getNode(Opc, dl, VT, V1); 2570 } 2571 2572 return SDValue(); 2573} 2574 2575static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2576 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2577 switch(Opc) { 2578 default: llvm_unreachable("Unknown x86 shuffle node"); 2579 case X86ISD::PSHUFD: 2580 case X86ISD::PSHUFHW: 2581 case X86ISD::PSHUFLW: 2582 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2583 } 2584 2585 return SDValue(); 2586} 2587 2588static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2589 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2590 switch(Opc) { 2591 default: llvm_unreachable("Unknown x86 shuffle node"); 2592 case X86ISD::PALIGN: 2593 case X86ISD::SHUFPD: 2594 case X86ISD::SHUFPS: 2595 return DAG.getNode(Opc, dl, VT, V1, V2, 2596 DAG.getConstant(TargetMask, MVT::i8)); 2597 } 2598 return SDValue(); 2599} 2600 2601static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2602 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2603 switch(Opc) { 2604 default: llvm_unreachable("Unknown x86 shuffle node"); 2605 case X86ISD::MOVLHPS: 2606 case X86ISD::MOVLHPD: 2607 case X86ISD::MOVHLPS: 2608 case X86ISD::MOVLPS: 2609 case X86ISD::MOVLPD: 2610 case X86ISD::MOVSS: 2611 case X86ISD::MOVSD: 2612 case X86ISD::UNPCKLPS: 2613 case X86ISD::UNPCKLPD: 2614 case X86ISD::PUNPCKLWD: 2615 case X86ISD::PUNPCKLBW: 2616 case X86ISD::PUNPCKLDQ: 2617 case X86ISD::PUNPCKLQDQ: 2618 case X86ISD::UNPCKHPS: 2619 case X86ISD::UNPCKHPD: 2620 case X86ISD::PUNPCKHWD: 2621 case X86ISD::PUNPCKHBW: 2622 case 
X86ISD::PUNPCKHDQ: 2623 case X86ISD::PUNPCKHQDQ: 2624 return DAG.getNode(Opc, dl, VT, V1, V2); 2625 } 2626 return SDValue(); 2627} 2628 2629SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2630 MachineFunction &MF = DAG.getMachineFunction(); 2631 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2632 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2633 2634 if (ReturnAddrIndex == 0) { 2635 // Set up a frame object for the return address. 2636 uint64_t SlotSize = TD->getPointerSize(); 2637 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2638 false); 2639 FuncInfo->setRAIndex(ReturnAddrIndex); 2640 } 2641 2642 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2643} 2644 2645 2646bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2647 bool hasSymbolicDisplacement) { 2648 // Offset should fit into 32 bit immediate field. 2649 if (!isInt<32>(Offset)) 2650 return false; 2651 2652 // If we don't have a symbolic displacement - we don't have any extra 2653 // restrictions. 2654 if (!hasSymbolicDisplacement) 2655 return true; 2656 2657 // FIXME: Some tweaks might be needed for medium code model. 2658 if (M != CodeModel::Small && M != CodeModel::Kernel) 2659 return false; 2660 2661 // For small code model we assume that latest object is 16MB before end of 31 2662 // bits boundary. We may also accept pretty large negative constants knowing 2663 // that all objects are in the positive half of address space. 2664 if (M == CodeModel::Small && Offset < 16*1024*1024) 2665 return true; 2666 2667 // For kernel code model we know that all object resist in the negative half 2668 // of 32bits address space. We may not accept negative offsets, since they may 2669 // be just off and we may accept pretty large positive ones. 2670 if (M == CodeModel::Kernel && Offset > 0) 2671 return true; 2672 2673 return false; 2674} 2675 2676/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2677/// specific condition code, returning the condition code and the LHS/RHS of the 2678/// comparison to make. 2679static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2680 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2681 if (!isFP) { 2682 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2683 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2684 // X > -1 -> X == 0, jump !sign. 2685 RHS = DAG.getConstant(0, RHS.getValueType()); 2686 return X86::COND_NS; 2687 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2688 // X < 0 -> X == 0, jump on sign. 2689 return X86::COND_S; 2690 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2691 // X < 1 -> X <= 0 2692 RHS = DAG.getConstant(0, RHS.getValueType()); 2693 return X86::COND_LE; 2694 } 2695 } 2696 2697 switch (SetCCOpcode) { 2698 default: llvm_unreachable("Invalid integer condition!"); 2699 case ISD::SETEQ: return X86::COND_E; 2700 case ISD::SETGT: return X86::COND_G; 2701 case ISD::SETGE: return X86::COND_GE; 2702 case ISD::SETLT: return X86::COND_L; 2703 case ISD::SETLE: return X86::COND_LE; 2704 case ISD::SETNE: return X86::COND_NE; 2705 case ISD::SETULT: return X86::COND_B; 2706 case ISD::SETUGT: return X86::COND_A; 2707 case ISD::SETULE: return X86::COND_BE; 2708 case ISD::SETUGE: return X86::COND_AE; 2709 } 2710 } 2711 2712 // First determine if it is required or is profitable to flip the operands. 2713 2714 // If LHS is a foldable load, but RHS is not, flip the condition. 
2715   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2716       !ISD::isNON_EXTLoad(RHS.getNode())) {
2717     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2718     std::swap(LHS, RHS);
2719   }
2720 
2721   switch (SetCCOpcode) {
2722   default: break;
2723   case ISD::SETOLT:
2724   case ISD::SETOLE:
2725   case ISD::SETUGT:
2726   case ISD::SETUGE:
2727     std::swap(LHS, RHS);
2728     break;
2729   }
2730 
2731   // On a floating point condition, the flags are set as follows:
2732   //  ZF  PF  CF   op
2733   //   0 | 0 | 0 | X > Y
2734   //   0 | 0 | 1 | X < Y
2735   //   1 | 0 | 0 | X == Y
2736   //   1 | 1 | 1 | unordered
2737   switch (SetCCOpcode) {
2738   default: llvm_unreachable("Condcode should be pre-legalized away");
2739   case ISD::SETUEQ:
2740   case ISD::SETEQ:  return X86::COND_E;
2741   case ISD::SETOLT:              // flipped
2742   case ISD::SETOGT:
2743   case ISD::SETGT:  return X86::COND_A;
2744   case ISD::SETOLE:              // flipped
2745   case ISD::SETOGE:
2746   case ISD::SETGE:  return X86::COND_AE;
2747   case ISD::SETUGT:              // flipped
2748   case ISD::SETULT:
2749   case ISD::SETLT:  return X86::COND_B;
2750   case ISD::SETUGE:              // flipped
2751   case ISD::SETULE:
2752   case ISD::SETLE:  return X86::COND_BE;
2753   case ISD::SETONE:
2754   case ISD::SETNE:  return X86::COND_NE;
2755   case ISD::SETUO:  return X86::COND_P;
2756   case ISD::SETO:   return X86::COND_NP;
2757   case ISD::SETOEQ:
2758   case ISD::SETUNE: return X86::COND_INVALID;
2759   }
2760 }
2761 
2762 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
2763 /// code?  The current x86 ISA includes the following FP cmov instructions:
2764 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2765 static bool hasFPCMov(unsigned X86CC) {
2766   switch (X86CC) {
2767   default:
2768     return false;
2769   case X86::COND_B:
2770   case X86::COND_BE:
2771   case X86::COND_E:
2772   case X86::COND_P:
2773   case X86::COND_A:
2774   case X86::COND_AE:
2775   case X86::COND_NE:
2776   case X86::COND_NP:
2777     return true;
2778   }
2779 }
2780 
2781 /// isFPImmLegal - Returns true if the target can instruction select the
2782 /// specified FP immediate natively. If false, the legalizer will
2783 /// materialize the FP immediate as a load from a constant pool.
2784 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2785   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2786     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2787       return true;
2788   }
2789   return false;
2790 }
2791 
2792 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
2793 /// the specified half-open range [Low, Hi).
2794 static bool isUndefOrInRange(int Val, int Low, int Hi) {
2795   return (Val < 0) || (Val >= Low && Val < Hi);
2796 }
2797 
2798 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2799 /// specified value.
2800 static bool isUndefOrEqual(int Val, int CmpVal) {
2801   if (Val < 0 || Val == CmpVal)
2802     return true;
2803   return false;
2804 }
2805 
2806 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2807 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
2808 /// the second operand.
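/// For example, for v4i32 a mask like <2,1,0,3> (or <0,u,2,u>, with 'u' for
/// undef) qualifies, while <0,1,4,5> does not because elements 4-7 would be
/// taken from the second vector.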
2809static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2810 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 2811 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2812 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2813 return (Mask[0] < 2 && Mask[1] < 2); 2814 return false; 2815} 2816 2817bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2818 SmallVector<int, 8> M; 2819 N->getMask(M); 2820 return ::isPSHUFDMask(M, N->getValueType(0)); 2821} 2822 2823/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2824/// is suitable for input to PSHUFHW. 2825static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2826 if (VT != MVT::v8i16) 2827 return false; 2828 2829 // Lower quadword copied in order or undef. 2830 for (int i = 0; i != 4; ++i) 2831 if (Mask[i] >= 0 && Mask[i] != i) 2832 return false; 2833 2834 // Upper quadword shuffled. 2835 for (int i = 4; i != 8; ++i) 2836 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2837 return false; 2838 2839 return true; 2840} 2841 2842bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2843 SmallVector<int, 8> M; 2844 N->getMask(M); 2845 return ::isPSHUFHWMask(M, N->getValueType(0)); 2846} 2847 2848/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2849/// is suitable for input to PSHUFLW. 2850static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2851 if (VT != MVT::v8i16) 2852 return false; 2853 2854 // Upper quadword copied in order. 2855 for (int i = 4; i != 8; ++i) 2856 if (Mask[i] >= 0 && Mask[i] != i) 2857 return false; 2858 2859 // Lower quadword shuffled. 2860 for (int i = 0; i != 4; ++i) 2861 if (Mask[i] >= 4) 2862 return false; 2863 2864 return true; 2865} 2866 2867bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2868 SmallVector<int, 8> M; 2869 N->getMask(M); 2870 return ::isPSHUFLWMask(M, N->getValueType(0)); 2871} 2872 2873/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2874/// is suitable for input to PALIGNR. 2875static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2876 bool hasSSSE3) { 2877 int i, e = VT.getVectorNumElements(); 2878 2879 // Do not handle v2i64 / v2f64 shuffles with palignr. 2880 if (e < 4 || !hasSSSE3) 2881 return false; 2882 2883 for (i = 0; i != e; ++i) 2884 if (Mask[i] >= 0) 2885 break; 2886 2887 // All undef, not a palignr. 2888 if (i == e) 2889 return false; 2890 2891 // Determine if it's ok to perform a palignr with only the LHS, since we 2892 // don't have access to the actual shuffle elements to see if RHS is undef. 2893 bool Unary = Mask[i] < (int)e; 2894 bool NeedsUnary = false; 2895 2896 int s = Mask[i] - i; 2897 2898 // Check the rest of the elements to see if they are consecutive. 2899 for (++i; i != e; ++i) { 2900 int m = Mask[i]; 2901 if (m < 0) 2902 continue; 2903 2904 Unary = Unary && (m < (int)e); 2905 NeedsUnary = NeedsUnary || (m < s); 2906 2907 if (NeedsUnary && !Unary) 2908 return false; 2909 if (Unary && m != ((s+i) & (e-1))) 2910 return false; 2911 if (!Unary && m != (s+i)) 2912 return false; 2913 } 2914 return true; 2915} 2916 2917bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2918 SmallVector<int, 8> M; 2919 N->getMask(M); 2920 return ::isPALIGNRMask(M, N->getValueType(0), true); 2921} 2922 2923/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2924/// specifies a shuffle of elements that is suitable for input to SHUFP*. 
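/// For v4f32/v4i32 the low two mask elements must select from the first
/// vector (values 0-3) and the high two from the second (values 4-7); e.g.
/// <0,3,4,6> and <u,2,7,u> qualify while <4,5,0,1> does not.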
2925static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2926 int NumElems = VT.getVectorNumElements(); 2927 if (NumElems != 2 && NumElems != 4) 2928 return false; 2929 2930 int Half = NumElems / 2; 2931 for (int i = 0; i < Half; ++i) 2932 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2933 return false; 2934 for (int i = Half; i < NumElems; ++i) 2935 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2936 return false; 2937 2938 return true; 2939} 2940 2941bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2942 SmallVector<int, 8> M; 2943 N->getMask(M); 2944 return ::isSHUFPMask(M, N->getValueType(0)); 2945} 2946 2947/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2948/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2949/// half elements to come from vector 1 (which would equal the dest.) and 2950/// the upper half to come from vector 2. 2951static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2952 int NumElems = VT.getVectorNumElements(); 2953 2954 if (NumElems != 2 && NumElems != 4) 2955 return false; 2956 2957 int Half = NumElems / 2; 2958 for (int i = 0; i < Half; ++i) 2959 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2960 return false; 2961 for (int i = Half; i < NumElems; ++i) 2962 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2963 return false; 2964 return true; 2965} 2966 2967static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2968 SmallVector<int, 8> M; 2969 N->getMask(M); 2970 return isCommutedSHUFPMask(M, N->getValueType(0)); 2971} 2972 2973/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2974/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2975bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2976 if (N->getValueType(0).getVectorNumElements() != 4) 2977 return false; 2978 2979 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2980 return isUndefOrEqual(N->getMaskElt(0), 6) && 2981 isUndefOrEqual(N->getMaskElt(1), 7) && 2982 isUndefOrEqual(N->getMaskElt(2), 2) && 2983 isUndefOrEqual(N->getMaskElt(3), 3); 2984} 2985 2986/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2987/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2988/// <2, 3, 2, 3> 2989bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2990 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2991 2992 if (NumElems != 4) 2993 return false; 2994 2995 return isUndefOrEqual(N->getMaskElt(0), 2) && 2996 isUndefOrEqual(N->getMaskElt(1), 3) && 2997 isUndefOrEqual(N->getMaskElt(2), 2) && 2998 isUndefOrEqual(N->getMaskElt(3), 3); 2999} 3000 3001/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3002/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3003bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3004 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3005 3006 if (NumElems != 2 && NumElems != 4) 3007 return false; 3008 3009 for (unsigned i = 0; i < NumElems/2; ++i) 3010 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3011 return false; 3012 3013 for (unsigned i = NumElems/2; i < NumElems; ++i) 3014 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3015 return false; 3016 3017 return true; 3018} 3019 3020/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3021/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 
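/// For v4f32 this corresponds to the mask <0, 1, 4, 5> (undef elements are
/// allowed): the low quadword of V1 followed by the low quadword of V2.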
3022bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3023 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3024 3025 if (NumElems != 2 && NumElems != 4) 3026 return false; 3027 3028 for (unsigned i = 0; i < NumElems/2; ++i) 3029 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3030 return false; 3031 3032 for (unsigned i = 0; i < NumElems/2; ++i) 3033 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3034 return false; 3035 3036 return true; 3037} 3038 3039/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3040/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3041static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3042 bool V2IsSplat = false) { 3043 int NumElts = VT.getVectorNumElements(); 3044 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3045 return false; 3046 3047 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3048 int BitI = Mask[i]; 3049 int BitI1 = Mask[i+1]; 3050 if (!isUndefOrEqual(BitI, j)) 3051 return false; 3052 if (V2IsSplat) { 3053 if (!isUndefOrEqual(BitI1, NumElts)) 3054 return false; 3055 } else { 3056 if (!isUndefOrEqual(BitI1, j + NumElts)) 3057 return false; 3058 } 3059 } 3060 return true; 3061} 3062 3063bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3064 SmallVector<int, 8> M; 3065 N->getMask(M); 3066 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3067} 3068 3069/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3070/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3071static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3072 bool V2IsSplat = false) { 3073 int NumElts = VT.getVectorNumElements(); 3074 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3075 return false; 3076 3077 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3078 int BitI = Mask[i]; 3079 int BitI1 = Mask[i+1]; 3080 if (!isUndefOrEqual(BitI, j + NumElts/2)) 3081 return false; 3082 if (V2IsSplat) { 3083 if (isUndefOrEqual(BitI1, NumElts)) 3084 return false; 3085 } else { 3086 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 3087 return false; 3088 } 3089 } 3090 return true; 3091} 3092 3093bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3094 SmallVector<int, 8> M; 3095 N->getMask(M); 3096 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3097} 3098 3099/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3100/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3101/// <0, 0, 1, 1> 3102static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3103 int NumElems = VT.getVectorNumElements(); 3104 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3105 return false; 3106 3107 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 3108 int BitI = Mask[i]; 3109 int BitI1 = Mask[i+1]; 3110 if (!isUndefOrEqual(BitI, j)) 3111 return false; 3112 if (!isUndefOrEqual(BitI1, j)) 3113 return false; 3114 } 3115 return true; 3116} 3117 3118bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3119 SmallVector<int, 8> M; 3120 N->getMask(M); 3121 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3122} 3123 3124/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3125/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. 
vector_shuffle v, undef, 3126/// <2, 2, 3, 3> 3127static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3128 int NumElems = VT.getVectorNumElements(); 3129 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3130 return false; 3131 3132 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3133 int BitI = Mask[i]; 3134 int BitI1 = Mask[i+1]; 3135 if (!isUndefOrEqual(BitI, j)) 3136 return false; 3137 if (!isUndefOrEqual(BitI1, j)) 3138 return false; 3139 } 3140 return true; 3141} 3142 3143bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3144 SmallVector<int, 8> M; 3145 N->getMask(M); 3146 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3147} 3148 3149/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3150/// specifies a shuffle of elements that is suitable for input to MOVSS, 3151/// MOVSD, and MOVD, i.e. setting the lowest element. 3152static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3153 if (VT.getVectorElementType().getSizeInBits() < 32) 3154 return false; 3155 3156 int NumElts = VT.getVectorNumElements(); 3157 3158 if (!isUndefOrEqual(Mask[0], NumElts)) 3159 return false; 3160 3161 for (int i = 1; i < NumElts; ++i) 3162 if (!isUndefOrEqual(Mask[i], i)) 3163 return false; 3164 3165 return true; 3166} 3167 3168bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3169 SmallVector<int, 8> M; 3170 N->getMask(M); 3171 return ::isMOVLMask(M, N->getValueType(0)); 3172} 3173 3174/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse 3175/// of what x86 movss wants: x86 movss requires the lowest element to be the 3176/// lowest element of vector 2 and the other elements to come from vector 1 in order. 3177static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3178 bool V2IsSplat = false, bool V2IsUndef = false) { 3179 int NumOps = VT.getVectorNumElements(); 3180 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3181 return false; 3182 3183 if (!isUndefOrEqual(Mask[0], 0)) 3184 return false; 3185 3186 for (int i = 1; i < NumOps; ++i) 3187 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3188 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3189 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3190 return false; 3191 3192 return true; 3193} 3194 3195static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3196 bool V2IsUndef = false) { 3197 SmallVector<int, 8> M; 3198 N->getMask(M); 3199 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3200} 3201 3202/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3203/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3204bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3205 if (N->getValueType(0).getVectorNumElements() != 4) 3206 return false; 3207 3208 // Expect 1, 1, 3, 3 3209 for (unsigned i = 0; i < 2; ++i) { 3210 int Elt = N->getMaskElt(i); 3211 if (Elt >= 0 && Elt != 1) 3212 return false; 3213 } 3214 3215 bool HasHi = false; 3216 for (unsigned i = 2; i < 4; ++i) { 3217 int Elt = N->getMaskElt(i); 3218 if (Elt >= 0 && Elt != 3) 3219 return false; 3220 if (Elt == 3) 3221 HasHi = true; 3222 } 3223 // Don't use movshdup if it can be done with a shufps. 3224 // FIXME: verify that matching u, u, 3, 3 is what we want. 3225 return HasHi; 3226} 3227 3228/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3229/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
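/// MOVSLDUP duplicates the even-indexed elements, so for v4f32 the expected
/// mask is <0, 0, 2, 2>; undef elements are tolerated, but at least one of the
/// high elements must actually be 2 (otherwise a shufps is preferred).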
3230bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3231 if (N->getValueType(0).getVectorNumElements() != 4) 3232 return false; 3233 3234 // Expect 0, 0, 2, 2 3235 for (unsigned i = 0; i < 2; ++i) 3236 if (N->getMaskElt(i) > 0) 3237 return false; 3238 3239 bool HasHi = false; 3240 for (unsigned i = 2; i < 4; ++i) { 3241 int Elt = N->getMaskElt(i); 3242 if (Elt >= 0 && Elt != 2) 3243 return false; 3244 if (Elt == 2) 3245 HasHi = true; 3246 } 3247 // Don't use movsldup if it can be done with a shufps. 3248 return HasHi; 3249} 3250 3251/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3252/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3253bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3254 int e = N->getValueType(0).getVectorNumElements() / 2; 3255 3256 for (int i = 0; i < e; ++i) 3257 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3258 return false; 3259 for (int i = 0; i < e; ++i) 3260 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3261 return false; 3262 return true; 3263} 3264 3265/// isVEXTRACTF128Index - Return true if the specified 3266/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3267/// suitable for input to VEXTRACTF128. 3268bool X86::isVEXTRACTF128Index(SDNode *N) { 3269 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3270 return false; 3271 3272 // The index should be aligned on a 128-bit boundary. 3273 uint64_t Index = 3274 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3275 3276 unsigned VL = N->getValueType(0).getVectorNumElements(); 3277 unsigned VBits = N->getValueType(0).getSizeInBits(); 3278 unsigned ElSize = VBits / VL; 3279 bool Result = (Index * ElSize) % 128 == 0; 3280 3281 return Result; 3282} 3283 3284/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 3285/// operand specifies a subvector insert that is suitable for input to 3286/// VINSERTF128. 3287bool X86::isVINSERTF128Index(SDNode *N) { 3288 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3289 return false; 3290 3291 // The index should be aligned on a 128-bit boundary. 3292 uint64_t Index = 3293 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3294 3295 unsigned VL = N->getValueType(0).getVectorNumElements(); 3296 unsigned VBits = N->getValueType(0).getSizeInBits(); 3297 unsigned ElSize = VBits / VL; 3298 bool Result = (Index * ElSize) % 128 == 0; 3299 3300 return Result; 3301} 3302 3303/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3304/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3305unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3306 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3307 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3308 3309 unsigned Shift = (NumOperands == 4) ? 2 : 1; 3310 unsigned Mask = 0; 3311 for (int i = 0; i < NumOperands; ++i) { 3312 int Val = SVOp->getMaskElt(NumOperands-i-1); 3313 if (Val < 0) Val = 0; 3314 if (Val >= NumOperands) Val -= NumOperands; 3315 Mask |= Val; 3316 if (i != NumOperands - 1) 3317 Mask <<= Shift; 3318 } 3319 return Mask; 3320} 3321 3322/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3323/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3324unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3325 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3326 unsigned Mask = 0; 3327 // 8 nodes, but we only care about the last 4. 
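  // Each of elements 7..4 contributes two bits (its index within the high
  // quadword, i.e. Val - 4); element 4 ends up in the lowest two bits of the
  // immediate and element 7 in the highest two.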
3328 for (unsigned i = 7; i >= 4; --i) { 3329 int Val = SVOp->getMaskElt(i); 3330 if (Val >= 0) 3331 Mask |= (Val - 4); 3332 if (i != 4) 3333 Mask <<= 2; 3334 } 3335 return Mask; 3336} 3337 3338/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3339/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3340unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3341 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3342 unsigned Mask = 0; 3343 // 8 nodes, but we only care about the first 4. 3344 for (int i = 3; i >= 0; --i) { 3345 int Val = SVOp->getMaskElt(i); 3346 if (Val >= 0) 3347 Mask |= Val; 3348 if (i != 0) 3349 Mask <<= 2; 3350 } 3351 return Mask; 3352} 3353 3354/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3355/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3356unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3357 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3358 EVT VVT = N->getValueType(0); 3359 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3360 int Val = 0; 3361 3362 unsigned i, e; 3363 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3364 Val = SVOp->getMaskElt(i); 3365 if (Val >= 0) 3366 break; 3367 } 3368 return (Val - i) * EltSize; 3369} 3370 3371/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 3372/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 3373/// instructions. 3374unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 3375 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3376 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 3377 3378 uint64_t Index = 3379 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3380 3381 EVT VecVT = N->getOperand(0).getValueType(); 3382 EVT ElVT = VecVT.getVectorElementType(); 3383 3384 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3385 3386 return Index / NumElemsPerChunk; 3387} 3388 3389/// getInsertVINSERTF128Immediate - Return the appropriate immediate 3390/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 3391/// instructions. 3392unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 3393 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3394 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 3395 3396 uint64_t Index = 3397 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3398 3399 EVT VecVT = N->getValueType(0); 3400 EVT ElVT = VecVT.getVectorElementType(); 3401 3402 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3403 3404 return Index / NumElemsPerChunk; 3405} 3406 3407/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3408/// constant +0.0. 3409bool X86::isZeroNode(SDValue Elt) { 3410 return ((isa<ConstantSDNode>(Elt) && 3411 cast<ConstantSDNode>(Elt)->isNullValue()) || 3412 (isa<ConstantFPSDNode>(Elt) && 3413 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3414} 3415 3416/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3417/// their permute mask. 
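/// For example, commuting shuffle(V1, V2, <0, 5, 2, 7>) yields
/// shuffle(V2, V1, <4, 1, 6, 3>).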
3418static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3419 SelectionDAG &DAG) { 3420 EVT VT = SVOp->getValueType(0); 3421 unsigned NumElems = VT.getVectorNumElements(); 3422 SmallVector<int, 8> MaskVec; 3423 3424 for (unsigned i = 0; i != NumElems; ++i) { 3425 int idx = SVOp->getMaskElt(i); 3426 if (idx < 0) 3427 MaskVec.push_back(idx); 3428 else if (idx < (int)NumElems) 3429 MaskVec.push_back(idx + NumElems); 3430 else 3431 MaskVec.push_back(idx - NumElems); 3432 } 3433 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3434 SVOp->getOperand(0), &MaskVec[0]); 3435} 3436 3437/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3438/// the two vector operands have swapped position. 3439static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3440 unsigned NumElems = VT.getVectorNumElements(); 3441 for (unsigned i = 0; i != NumElems; ++i) { 3442 int idx = Mask[i]; 3443 if (idx < 0) 3444 continue; 3445 else if (idx < (int)NumElems) 3446 Mask[i] = idx + NumElems; 3447 else 3448 Mask[i] = idx - NumElems; 3449 } 3450} 3451 3452/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3453/// match movhlps. The lower half elements should come from the upper half of 3454/// V1 (and in order), and the upper half elements should come from the upper 3455/// half of V2 (and in order). 3456static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3457 if (Op->getValueType(0).getVectorNumElements() != 4) 3458 return false; 3459 for (unsigned i = 0, e = 2; i != e; ++i) 3460 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3461 return false; 3462 for (unsigned i = 2; i != 4; ++i) 3463 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3464 return false; 3465 return true; 3466} 3467 3468/// isScalarLoadToVector - Returns true if the node is a scalar load that 3469/// is promoted to a vector. It also returns the LoadSDNode by reference if 3470/// required. 3471static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3472 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3473 return false; 3474 N = N->getOperand(0).getNode(); 3475 if (!ISD::isNON_EXTLoad(N)) 3476 return false; 3477 if (LD) 3478 *LD = cast<LoadSDNode>(N); 3479 return true; 3480} 3481 3482/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3483/// match movlp{s|d}. The lower half elements should come from the lower half of 3484/// V1 (and in order), and the upper half elements should come from the upper 3485/// half of V2 (and in order). And since V1 will become the source of the 3486/// MOVLP, it must be either a vector load or a scalar load to vector. 3487static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3488 ShuffleVectorSDNode *Op) { 3489 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3490 return false; 3491 // If V2 is a vector load, don't do this transformation. We will try to fold 3492 // the load into a shufps op instead. 3493 if (ISD::isNON_EXTLoad(V2)) 3494 return false; 3495 3496 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3497 3498 if (NumElems != 2 && NumElems != 4) 3499 return false; 3500 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3501 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3502 return false; 3503 for (unsigned i = NumElems/2; i != NumElems; ++i) 3504 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3505 return false; 3506 return true; 3507} 3508 3509/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3510/// all the same.
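/// For example, (build_vector X, X, X, X) is a splat, while
/// (build_vector X, Y, X, X) is not.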
3511static bool isSplatVector(SDNode *N) { 3512 if (N->getOpcode() != ISD::BUILD_VECTOR) 3513 return false; 3514 3515 SDValue SplatValue = N->getOperand(0); 3516 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3517 if (N->getOperand(i) != SplatValue) 3518 return false; 3519 return true; 3520} 3521 3522/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3523/// to a zero vector. 3524/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3525static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3526 SDValue V1 = N->getOperand(0); 3527 SDValue V2 = N->getOperand(1); 3528 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3529 for (unsigned i = 0; i != NumElems; ++i) { 3530 int Idx = N->getMaskElt(i); 3531 if (Idx >= (int)NumElems) { 3532 unsigned Opc = V2.getOpcode(); 3533 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3534 continue; 3535 if (Opc != ISD::BUILD_VECTOR || 3536 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3537 return false; 3538 } else if (Idx >= 0) { 3539 unsigned Opc = V1.getOpcode(); 3540 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3541 continue; 3542 if (Opc != ISD::BUILD_VECTOR || 3543 !X86::isZeroNode(V1.getOperand(Idx))) 3544 return false; 3545 } 3546 } 3547 return true; 3548} 3549 3550/// getZeroVector - Returns a vector of specified type with all zero elements. 3551/// 3552static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3553 DebugLoc dl) { 3554 assert(VT.isVector() && "Expected a vector type"); 3555 3556 // Always build SSE zero vectors as <4 x i32> bitcasted 3557 // to their dest type. This ensures they get CSE'd. 3558 SDValue Vec; 3559 if (VT.getSizeInBits() == 128) { // SSE 3560 if (HasSSE2) { // SSE2 3561 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3562 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3563 } else { // SSE1 3564 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3565 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3566 } 3567 } else if (VT.getSizeInBits() == 256) { // AVX 3568 // 256-bit logic and arithmetic instructions in AVX are 3569 // all floating-point, no support for integer ops. Default 3570 // to emitting fp zeroed vectors then. 3571 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3572 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3573 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3574 } 3575 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3576} 3577 3578/// getOnesVector - Returns a vector of specified type with all bits set. 3579/// 3580static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3581 assert(VT.isVector() && "Expected a vector type"); 3582 3583 // Always build ones vectors as <4 x i32> bitcasted to their dest 3584 // type. This ensures they get CSE'd. 3585 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3586 SDValue Vec; 3587 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3588 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3589} 3590 3591 3592/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3593/// that point to V2 point to its first element.
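/// For example, with a splatted V2 and the v4 mask <0, 5, 1, 7>, the mask is
/// rewritten to <0, 4, 1, 4>.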
3594static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3595 EVT VT = SVOp->getValueType(0); 3596 unsigned NumElems = VT.getVectorNumElements(); 3597 3598 bool Changed = false; 3599 SmallVector<int, 8> MaskVec; 3600 SVOp->getMask(MaskVec); 3601 3602 for (unsigned i = 0; i != NumElems; ++i) { 3603 if (MaskVec[i] > (int)NumElems) { 3604 MaskVec[i] = NumElems; 3605 Changed = true; 3606 } 3607 } 3608 if (Changed) 3609 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3610 SVOp->getOperand(1), &MaskVec[0]); 3611 return SDValue(SVOp, 0); 3612} 3613 3614/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3615/// operation of specified width. 3616static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3617 SDValue V2) { 3618 unsigned NumElems = VT.getVectorNumElements(); 3619 SmallVector<int, 8> Mask; 3620 Mask.push_back(NumElems); 3621 for (unsigned i = 1; i != NumElems; ++i) 3622 Mask.push_back(i); 3623 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3624} 3625 3626/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3627static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3628 SDValue V2) { 3629 unsigned NumElems = VT.getVectorNumElements(); 3630 SmallVector<int, 8> Mask; 3631 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3632 Mask.push_back(i); 3633 Mask.push_back(i + NumElems); 3634 } 3635 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3636} 3637 3638/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3639static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3640 SDValue V2) { 3641 unsigned NumElems = VT.getVectorNumElements(); 3642 unsigned Half = NumElems/2; 3643 SmallVector<int, 8> Mask; 3644 for (unsigned i = 0; i != Half; ++i) { 3645 Mask.push_back(i + Half); 3646 Mask.push_back(i + NumElems + Half); 3647 } 3648 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3649} 3650 3651/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3652static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 3653 EVT PVT = MVT::v4f32; 3654 EVT VT = SV->getValueType(0); 3655 DebugLoc dl = SV->getDebugLoc(); 3656 SDValue V1 = SV->getOperand(0); 3657 int NumElems = VT.getVectorNumElements(); 3658 int EltNo = SV->getSplatIndex(); 3659 3660 // unpack elements to the correct location 3661 while (NumElems > 4) { 3662 if (EltNo < NumElems/2) { 3663 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3664 } else { 3665 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3666 EltNo -= NumElems/2; 3667 } 3668 NumElems >>= 1; 3669 } 3670 3671 // Perform the splat. 3672 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3673 V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1); 3674 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3675 return DAG.getNode(ISD::BITCAST, dl, VT, V1); 3676} 3677 3678/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3679/// vector of zero or undef vector. This produces a shuffle where the low 3680/// element of V2 is swizzled into the zero/undef vector, landing at element 3681/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3682static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3683 bool isZero, bool HasSSE2, 3684 SelectionDAG &DAG) { 3685 EVT VT = V2.getValueType(); 3686 SDValue V1 = isZero 3687 ? 
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3688 unsigned NumElems = VT.getVectorNumElements(); 3689 SmallVector<int, 16> MaskVec; 3690 for (unsigned i = 0; i != NumElems; ++i) 3691 // If this is the insertion idx, put the low elt of V2 here. 3692 MaskVec.push_back(i == Idx ? NumElems : i); 3693 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3694} 3695 3696/// getShuffleScalarElt - Returns the scalar element that will make up the ith 3697/// element of the result of the vector shuffle. 3698SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 3699 unsigned Depth) { 3700 if (Depth == 6) 3701 return SDValue(); // Limit search depth. 3702 3703 SDValue V = SDValue(N, 0); 3704 EVT VT = V.getValueType(); 3705 unsigned Opcode = V.getOpcode(); 3706 3707 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 3708 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 3709 Index = SV->getMaskElt(Index); 3710 3711 if (Index < 0) 3712 return DAG.getUNDEF(VT.getVectorElementType()); 3713 3714 int NumElems = VT.getVectorNumElements(); 3715 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 3716 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 3717 } 3718 3719 // Recurse into target specific vector shuffles to find scalars. 3720 if (isTargetShuffle(Opcode)) { 3721 int NumElems = VT.getVectorNumElements(); 3722 SmallVector<unsigned, 16> ShuffleMask; 3723 SDValue ImmN; 3724 3725 switch(Opcode) { 3726 case X86ISD::SHUFPS: 3727 case X86ISD::SHUFPD: 3728 ImmN = N->getOperand(N->getNumOperands()-1); 3729 DecodeSHUFPSMask(NumElems, 3730 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3731 ShuffleMask); 3732 break; 3733 case X86ISD::PUNPCKHBW: 3734 case X86ISD::PUNPCKHWD: 3735 case X86ISD::PUNPCKHDQ: 3736 case X86ISD::PUNPCKHQDQ: 3737 DecodePUNPCKHMask(NumElems, ShuffleMask); 3738 break; 3739 case X86ISD::UNPCKHPS: 3740 case X86ISD::UNPCKHPD: 3741 DecodeUNPCKHPMask(NumElems, ShuffleMask); 3742 break; 3743 case X86ISD::PUNPCKLBW: 3744 case X86ISD::PUNPCKLWD: 3745 case X86ISD::PUNPCKLDQ: 3746 case X86ISD::PUNPCKLQDQ: 3747 DecodePUNPCKLMask(NumElems, ShuffleMask); 3748 break; 3749 case X86ISD::UNPCKLPS: 3750 case X86ISD::UNPCKLPD: 3751 DecodeUNPCKLPMask(NumElems, ShuffleMask); 3752 break; 3753 case X86ISD::MOVHLPS: 3754 DecodeMOVHLPSMask(NumElems, ShuffleMask); 3755 break; 3756 case X86ISD::MOVLHPS: 3757 DecodeMOVLHPSMask(NumElems, ShuffleMask); 3758 break; 3759 case X86ISD::PSHUFD: 3760 ImmN = N->getOperand(N->getNumOperands()-1); 3761 DecodePSHUFMask(NumElems, 3762 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3763 ShuffleMask); 3764 break; 3765 case X86ISD::PSHUFHW: 3766 ImmN = N->getOperand(N->getNumOperands()-1); 3767 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3768 ShuffleMask); 3769 break; 3770 case X86ISD::PSHUFLW: 3771 ImmN = N->getOperand(N->getNumOperands()-1); 3772 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3773 ShuffleMask); 3774 break; 3775 case X86ISD::MOVSS: 3776 case X86ISD::MOVSD: { 3777 // The index 0 always comes from the first element of the second source, 3778 // this is why MOVSS and MOVSD are used in the first place. The other 3779 // elements come from the other positions of the first source vector. 3780 unsigned OpNum = (Index == 0) ? 
1 : 0; 3781 return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, 3782 Depth+1); 3783 } 3784 default: 3785 assert(0 && "not implemented for target shuffle node"); 3786 return SDValue(); 3787 } 3788 3789 Index = ShuffleMask[Index]; 3790 if (Index < 0) 3791 return DAG.getUNDEF(VT.getVectorElementType()); 3792 3793 SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1); 3794 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, 3795 Depth+1); 3796 } 3797 3798 // Actual nodes that may contain scalar elements 3799 if (Opcode == ISD::BITCAST) { 3800 V = V.getOperand(0); 3801 EVT SrcVT = V.getValueType(); 3802 unsigned NumElems = VT.getVectorNumElements(); 3803 3804 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 3805 return SDValue(); 3806 } 3807 3808 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 3809 return (Index == 0) ? V.getOperand(0) 3810 : DAG.getUNDEF(VT.getVectorElementType()); 3811 3812 if (V.getOpcode() == ISD::BUILD_VECTOR) 3813 return V.getOperand(Index); 3814 3815 return SDValue(); 3816} 3817 3818/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a 3819/// vector shuffle operation that are zero (or undef). The 3820/// search can start in two different directions, from left or right. 3821static 3822unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, 3823 bool ZerosFromLeft, SelectionDAG &DAG) { 3824 int i = 0; 3825 3826 while (i < NumElems) { 3827 unsigned Index = ZerosFromLeft ? i : NumElems-i-1; 3828 SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0); 3829 if (!(Elt.getNode() && 3830 (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) 3831 break; 3832 ++i; 3833 } 3834 3835 return i; 3836} 3837 3838/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to 3839/// MaskE correspond consecutively to elements from one of the vector operands, 3840/// starting from its index OpIdx. Also sets OpNum to the source vector operand used. 3841static 3842bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE, 3843 int OpIdx, int NumElems, unsigned &OpNum) { 3844 bool SeenV1 = false; 3845 bool SeenV2 = false; 3846 3847 for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) { 3848 int Idx = SVOp->getMaskElt(i); 3849 // Ignore undef indices 3850 if (Idx < 0) 3851 continue; 3852 3853 if (Idx < NumElems) 3854 SeenV1 = true; 3855 else 3856 SeenV2 = true; 3857 3858 // Only accept consecutive elements from the same vector 3859 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) 3860 return false; 3861 } 3862 3863 OpNum = SeenV1 ? 0 : 1; 3864 return true; 3865} 3866 3867/// isVectorShiftRight - Returns true if the shuffle can be implemented as a 3868/// logical right shift of a vector. 3869static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3870 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3871 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3872 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 3873 false /* check zeros from right */, DAG); 3874 unsigned OpSrc; 3875 3876 if (!NumZeros) 3877 return false; 3878 3879 // Considering the elements in the mask that are not consecutive zeros, 3880 // check if they consecutively come from only one of the source vectors.
3881 // 3882 // V1 = {X, A, B, C} 0 3883 // \ \ \ / 3884 // vector_shuffle V1, V2 <1, 2, 3, X> 3885 // 3886 if (!isShuffleMaskConsecutive(SVOp, 3887 0, // Mask Start Index 3888 NumElems-NumZeros-1, // Mask End Index 3889 NumZeros, // Where to start looking in the src vector 3890 NumElems, // Number of elements in vector 3891 OpSrc)) // Which source operand ? 3892 return false; 3893 3894 isLeft = false; 3895 ShAmt = NumZeros; 3896 ShVal = SVOp->getOperand(OpSrc); 3897 return true; 3898} 3899 3900/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 3901/// logical left shift of a vector. 3902static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3903 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3904 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3905 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 3906 true /* check zeros from left */, DAG); 3907 unsigned OpSrc; 3908 3909 if (!NumZeros) 3910 return false; 3911 3912 // Considering the elements in the mask that are not consecutive zeros, 3913 // check if they consecutively come from only one of the source vectors. 3914 // 3915 // 0 { A, B, X, X } = V2 3916 // / \ / / 3917 // vector_shuffle V1, V2 <X, X, 4, 5> 3918 // 3919 if (!isShuffleMaskConsecutive(SVOp, 3920 NumZeros, // Mask Start Index 3921 NumElems-1, // Mask End Index 3922 0, // Where to start looking in the src vector 3923 NumElems, // Number of elements in vector 3924 OpSrc)) // Which source operand ? 3925 return false; 3926 3927 isLeft = true; 3928 ShAmt = NumZeros; 3929 ShVal = SVOp->getOperand(OpSrc); 3930 return true; 3931} 3932 3933/// isVectorShift - Returns true if the shuffle can be implemented as a 3934/// logical left or right shift of a vector. 3935static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3936 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3937 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 3938 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 3939 return true; 3940 3941 return false; 3942} 3943 3944/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
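/// Non-zero bytes are zero-extended to i16 and combined in pairs (the odd byte
/// shifted left by 8 and OR'd with the even byte), each pair is inserted into a
/// v8i16, and the result is bitcast back to v16i8.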
3945/// 3946static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3947 unsigned NumNonZero, unsigned NumZero, 3948 SelectionDAG &DAG, 3949 const TargetLowering &TLI) { 3950 if (NumNonZero > 8) 3951 return SDValue(); 3952 3953 DebugLoc dl = Op.getDebugLoc(); 3954 SDValue V(0, 0); 3955 bool First = true; 3956 for (unsigned i = 0; i < 16; ++i) { 3957 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3958 if (ThisIsNonZero && First) { 3959 if (NumZero) 3960 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3961 else 3962 V = DAG.getUNDEF(MVT::v8i16); 3963 First = false; 3964 } 3965 3966 if ((i & 1) != 0) { 3967 SDValue ThisElt(0, 0), LastElt(0, 0); 3968 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3969 if (LastIsNonZero) { 3970 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3971 MVT::i16, Op.getOperand(i-1)); 3972 } 3973 if (ThisIsNonZero) { 3974 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3975 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3976 ThisElt, DAG.getConstant(8, MVT::i8)); 3977 if (LastIsNonZero) 3978 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3979 } else 3980 ThisElt = LastElt; 3981 3982 if (ThisElt.getNode()) 3983 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3984 DAG.getIntPtrConstant(i/2)); 3985 } 3986 } 3987 3988 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 3989} 3990 3991/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3992/// 3993static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3994 unsigned NumNonZero, unsigned NumZero, 3995 SelectionDAG &DAG, 3996 const TargetLowering &TLI) { 3997 if (NumNonZero > 4) 3998 return SDValue(); 3999 4000 DebugLoc dl = Op.getDebugLoc(); 4001 SDValue V(0, 0); 4002 bool First = true; 4003 for (unsigned i = 0; i < 8; ++i) { 4004 bool isNonZero = (NonZeros & (1 << i)) != 0; 4005 if (isNonZero) { 4006 if (First) { 4007 if (NumZero) 4008 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4009 else 4010 V = DAG.getUNDEF(MVT::v8i16); 4011 First = false; 4012 } 4013 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4014 MVT::v8i16, V, Op.getOperand(i), 4015 DAG.getIntPtrConstant(i)); 4016 } 4017 } 4018 4019 return V; 4020} 4021 4022/// getVShift - Return a vector logical shift node. 4023/// 4024static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4025 unsigned NumBits, SelectionDAG &DAG, 4026 const TargetLowering &TLI, DebugLoc dl) { 4027 EVT ShVT = MVT::v2i64; 4028 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 4029 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4030 return DAG.getNode(ISD::BITCAST, dl, VT, 4031 DAG.getNode(Opc, dl, ShVT, SrcOp, 4032 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 4033} 4034 4035SDValue 4036X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4037 SelectionDAG &DAG) const { 4038 4039 // Check if the scalar load can be widened into a vector load. And if 4040 // the address is "base + cst" see if the cst can be "absorbed" into 4041 // the shuffle mask. 
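  // For example, an i32 load from offset 4 of a 16-byte aligned stack object
  // can be widened to a v4i32 load of the whole object followed by a
  // <1, 1, 1, 1> splat shuffle of the loaded vector.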
4042 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4043 SDValue Ptr = LD->getBasePtr(); 4044 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4045 return SDValue(); 4046 EVT PVT = LD->getValueType(0); 4047 if (PVT != MVT::i32 && PVT != MVT::f32) 4048 return SDValue(); 4049 4050 int FI = -1; 4051 int64_t Offset = 0; 4052 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4053 FI = FINode->getIndex(); 4054 Offset = 0; 4055 } else if (Ptr.getOpcode() == ISD::ADD && 4056 isa<ConstantSDNode>(Ptr.getOperand(1)) && 4057 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4058 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4059 Offset = Ptr.getConstantOperandVal(1); 4060 Ptr = Ptr.getOperand(0); 4061 } else { 4062 return SDValue(); 4063 } 4064 4065 SDValue Chain = LD->getChain(); 4066 // Make sure the stack object alignment is at least 16. 4067 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4068 if (DAG.InferPtrAlignment(Ptr) < 16) { 4069 if (MFI->isFixedObjectIndex(FI)) { 4070 // Can't change the alignment. FIXME: It's possible to compute 4071 // the exact stack offset and reference FI + adjust offset instead. 4072 // If someone *really* cares about this. That's the way to implement it. 4073 return SDValue(); 4074 } else { 4075 MFI->setObjectAlignment(FI, 16); 4076 } 4077 } 4078 4079 // (Offset % 16) must be multiple of 4. Then address is then 4080 // Ptr + (Offset & ~15). 4081 if (Offset < 0) 4082 return SDValue(); 4083 if ((Offset % 16) & 3) 4084 return SDValue(); 4085 int64_t StartOffset = Offset & ~15; 4086 if (StartOffset) 4087 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4088 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4089 4090 int EltNo = (Offset - StartOffset) >> 2; 4091 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 4092 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; 4093 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, 4094 LD->getPointerInfo().getWithOffset(StartOffset), 4095 false, false, 0); 4096 // Canonicalize it to a v4i32 shuffle. 4097 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 4098 return DAG.getNode(ISD::BITCAST, dl, VT, 4099 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 4100 DAG.getUNDEF(MVT::v4i32),&Mask[0])); 4101 } 4102 4103 return SDValue(); 4104} 4105 4106/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4107/// vector of type 'VT', see if the elements can be replaced by a single large 4108/// load which has the same value as a build_vector whose operands are 'elts'. 4109/// 4110/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4111/// 4112/// FIXME: we'd also like to handle the case where the last elements are zero 4113/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4114/// There's even a handy isZeroNode for that purpose. 4115static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4116 DebugLoc &DL, SelectionDAG &DAG) { 4117 EVT EltVT = VT.getVectorElementType(); 4118 unsigned NumElems = Elts.size(); 4119 4120 LoadSDNode *LDBase = NULL; 4121 unsigned LastLoadedElt = -1U; 4122 4123 // For each element in the initializer, see if we've found a load or an undef. 4124 // If we don't find an initial load element, or later load elements are 4125 // non-consecutive, bail out. 
4126 for (unsigned i = 0; i < NumElems; ++i) { 4127 SDValue Elt = Elts[i]; 4128 4129 if (!Elt.getNode() || 4130 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4131 return SDValue(); 4132 if (!LDBase) { 4133 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4134 return SDValue(); 4135 LDBase = cast<LoadSDNode>(Elt.getNode()); 4136 LastLoadedElt = i; 4137 continue; 4138 } 4139 if (Elt.getOpcode() == ISD::UNDEF) 4140 continue; 4141 4142 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4143 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4144 return SDValue(); 4145 LastLoadedElt = i; 4146 } 4147 4148 // If we have found an entire vector of loads and undefs, then return a large 4149 // load of the entire vector width starting at the base pointer. If we found 4150 // consecutive loads for the low half, generate a vzext_load node. 4151 if (LastLoadedElt == NumElems - 1) { 4152 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4153 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4154 LDBase->getPointerInfo(), 4155 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 4156 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4157 LDBase->getPointerInfo(), 4158 LDBase->isVolatile(), LDBase->isNonTemporal(), 4159 LDBase->getAlignment()); 4160 } else if (NumElems == 4 && LastLoadedElt == 1) { 4161 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4162 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4163 SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, 4164 Ops, 2, MVT::i32, 4165 LDBase->getMemOperand()); 4166 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4167 } 4168 return SDValue(); 4169} 4170 4171SDValue 4172X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 4173 DebugLoc dl = Op.getDebugLoc(); 4174 // All zero's are handled with pxor in SSE2 and above, xorps in SSE1. 4175 // All one's are handled with pcmpeqd. In AVX, zero's are handled with 4176 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd 4177 // is present, so AllOnes is ignored. 4178 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 4179 (Op.getValueType().getSizeInBits() != 256 && 4180 ISD::isBuildVectorAllOnes(Op.getNode()))) { 4181 // Canonicalize this to <4 x i32> (SSE) to 4182 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 4183 // eliminated on x86-32 hosts. 4184 if (Op.getValueType() == MVT::v4i32) 4185 return Op; 4186 4187 if (ISD::isBuildVectorAllOnes(Op.getNode())) 4188 return getOnesVector(Op.getValueType(), DAG, dl); 4189 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 4190 } 4191 4192 EVT VT = Op.getValueType(); 4193 EVT ExtVT = VT.getVectorElementType(); 4194 unsigned EVTBits = ExtVT.getSizeInBits(); 4195 4196 unsigned NumElems = Op.getNumOperands(); 4197 unsigned NumZero = 0; 4198 unsigned NumNonZero = 0; 4199 unsigned NonZeros = 0; 4200 bool IsAllConstants = true; 4201 SmallSet<SDValue, 8> Values; 4202 for (unsigned i = 0; i < NumElems; ++i) { 4203 SDValue Elt = Op.getOperand(i); 4204 if (Elt.getOpcode() == ISD::UNDEF) 4205 continue; 4206 Values.insert(Elt); 4207 if (Elt.getOpcode() != ISD::Constant && 4208 Elt.getOpcode() != ISD::ConstantFP) 4209 IsAllConstants = false; 4210 if (X86::isZeroNode(Elt)) 4211 NumZero++; 4212 else { 4213 NonZeros |= (1 << i); 4214 NumNonZero++; 4215 } 4216 } 4217 4218 // All undef vector. Return an UNDEF. All zero vectors were handled above. 
4219 if (NumNonZero == 0) 4220 return DAG.getUNDEF(VT); 4221 4222 // Special case for single non-zero, non-undef, element. 4223 if (NumNonZero == 1) { 4224 unsigned Idx = CountTrailingZeros_32(NonZeros); 4225 SDValue Item = Op.getOperand(Idx); 4226 4227 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4228 // the value are obviously zero, truncate the value to i32 and do the 4229 // insertion that way. Only do this if the value is non-constant or if the 4230 // value is a constant being inserted into element 0. It is cheaper to do 4231 // a constant pool load than it is to do a movd + shuffle. 4232 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 4233 (!IsAllConstants || Idx == 0)) { 4234 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 4235 // Handle SSE only. 4236 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 4237 EVT VecVT = MVT::v4i32; 4238 unsigned VecElts = 4; 4239 4240 // Truncate the value (which may itself be a constant) to i32, and 4241 // convert it to a vector with movd (S2V+shuffle to zero extend). 4242 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4243 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4244 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4245 Subtarget->hasSSE2(), DAG); 4246 4247 // Now we have our 32-bit value zero extended in the low element of 4248 // a vector. If Idx != 0, swizzle it into place. 4249 if (Idx != 0) { 4250 SmallVector<int, 4> Mask; 4251 Mask.push_back(Idx); 4252 for (unsigned i = 1; i != VecElts; ++i) 4253 Mask.push_back(i); 4254 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4255 DAG.getUNDEF(Item.getValueType()), 4256 &Mask[0]); 4257 } 4258 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); 4259 } 4260 } 4261 4262 // If we have a constant or non-constant insertion into the low element of 4263 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4264 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4265 // depending on what the source datatype is. 4266 if (Idx == 0) { 4267 if (NumZero == 0) { 4268 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4269 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4270 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4271 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4272 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 4273 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4274 DAG); 4275 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4276 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4277 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 4278 EVT MiddleVT = MVT::v4i32; 4279 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4280 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4281 Subtarget->hasSSE2(), DAG); 4282 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 4283 } 4284 } 4285 4286 // Is it a vector logical left shift? 4287 if (NumElems == 2 && Idx == 1 && 4288 X86::isZeroNode(Op.getOperand(0)) && 4289 !X86::isZeroNode(Op.getOperand(1))) { 4290 unsigned NumBits = VT.getSizeInBits(); 4291 return getVShift(true, VT, 4292 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4293 VT, Op.getOperand(1)), 4294 NumBits/2, DAG, *this, dl); 4295 } 4296 4297 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
4298 return SDValue(); 4299 4300 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4301 // is a non-constant being inserted into an element other than the low one, 4302 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4303 // movd/movss) to move this into the low element, then shuffle it into 4304 // place. 4305 if (EVTBits == 32) { 4306 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4307 4308 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4309 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4310 Subtarget->hasSSE2(), DAG); 4311 SmallVector<int, 8> MaskVec; 4312 for (unsigned i = 0; i < NumElems; i++) 4313 MaskVec.push_back(i == Idx ? 0 : 1); 4314 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4315 } 4316 } 4317 4318 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4319 if (Values.size() == 1) { 4320 if (EVTBits == 32) { 4321 // Instead of a shuffle like this: 4322 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4323 // Check if it's possible to issue this instead. 4324 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4325 unsigned Idx = CountTrailingZeros_32(NonZeros); 4326 SDValue Item = Op.getOperand(Idx); 4327 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4328 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4329 } 4330 return SDValue(); 4331 } 4332 4333 // A vector full of immediates; various special cases are already 4334 // handled, so this is best done with a single constant-pool load. 4335 if (IsAllConstants) 4336 return SDValue(); 4337 4338 // Let legalizer expand 2-wide build_vectors. 4339 if (EVTBits == 64) { 4340 if (NumNonZero == 1) { 4341 // One half is zero or undef. 4342 unsigned Idx = CountTrailingZeros_32(NonZeros); 4343 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4344 Op.getOperand(Idx)); 4345 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4346 Subtarget->hasSSE2(), DAG); 4347 } 4348 return SDValue(); 4349 } 4350 4351 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4352 if (EVTBits == 8 && NumElems == 16) { 4353 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4354 *this); 4355 if (V.getNode()) return V; 4356 } 4357 4358 if (EVTBits == 16 && NumElems == 8) { 4359 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4360 *this); 4361 if (V.getNode()) return V; 4362 } 4363 4364 // If element VT is == 32 bits, turn it into a number of shuffles. 4365 SmallVector<SDValue, 8> V; 4366 V.resize(NumElems); 4367 if (NumElems == 4 && NumZero > 0) { 4368 for (unsigned i = 0; i < 4; ++i) { 4369 bool isZero = !(NonZeros & (1 << i)); 4370 if (isZero) 4371 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4372 else 4373 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4374 } 4375 4376 for (unsigned i = 0; i < 2; ++i) { 4377 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4378 default: break; 4379 case 0: 4380 V[i] = V[i*2]; // Must be a zero vector. 4381 break; 4382 case 1: 4383 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4384 break; 4385 case 2: 4386 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4387 break; 4388 case 3: 4389 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4390 break; 4391 } 4392 } 4393 4394 SmallVector<int, 8> MaskVec; 4395 bool Reverse = (NonZeros & 0x3) == 2; 4396 for (unsigned i = 0; i < 2; ++i) 4397 MaskVec.push_back(Reverse ? 
1-i : i); 4398 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4399 for (unsigned i = 0; i < 2; ++i) 4400 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4401 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4402 } 4403 4404 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4405 // Check for a build vector of consecutive loads. 4406 for (unsigned i = 0; i < NumElems; ++i) 4407 V[i] = Op.getOperand(i); 4408 4409 // Check for elements which are consecutive loads. 4410 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4411 if (LD.getNode()) 4412 return LD; 4413 4414 // For SSE 4.1, use insertps to put the high elements into the low element. 4415 if (getSubtarget()->hasSSE41()) { 4416 SDValue Result; 4417 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 4418 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 4419 else 4420 Result = DAG.getUNDEF(VT); 4421 4422 for (unsigned i = 1; i < NumElems; ++i) { 4423 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 4424 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 4425 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4426 } 4427 return Result; 4428 } 4429 4430 // Otherwise, expand into a number of unpckl*, start by extending each of 4431 // our (non-undef) elements to the full vector width with the element in the 4432 // bottom slot of the vector (which generates no code for SSE). 4433 for (unsigned i = 0; i < NumElems; ++i) { 4434 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4435 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4436 else 4437 V[i] = DAG.getUNDEF(VT); 4438 } 4439 4440 // Next, we iteratively mix elements, e.g. for v4f32: 4441 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4442 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4443 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4444 unsigned EltStride = NumElems >> 1; 4445 while (EltStride != 0) { 4446 for (unsigned i = 0; i < EltStride; ++i) { 4447 // If V[i+EltStride] is undef and this is the first round of mixing, 4448 // then it is safe to just drop this shuffle: V[i] is already in the 4449 // right place, the one element (since it's the first round) being 4450 // inserted as undef can be dropped. This isn't safe for successive 4451 // rounds because they will permute elements within both vectors. 4452 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 4453 EltStride == NumElems/2) 4454 continue; 4455 4456 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 4457 } 4458 EltStride >>= 1; 4459 } 4460 return V[0]; 4461 } 4462 return SDValue(); 4463} 4464 4465SDValue 4466X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4467 // We support concatenate two MMX registers and place them in a MMX 4468 // register. This is better than doing a stack convert. 
4469 DebugLoc dl = Op.getDebugLoc(); 4470 EVT ResVT = Op.getValueType(); 4471 assert(Op.getNumOperands() == 2); 4472 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4473 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4474 int Mask[2]; 4475 SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); 4476 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4477 InVec = Op.getOperand(1); 4478 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4479 unsigned NumElts = ResVT.getVectorNumElements(); 4480 VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4481 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4482 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4483 } else { 4484 InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); 4485 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4486 Mask[0] = 0; Mask[1] = 2; 4487 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4488 } 4489 return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4490} 4491 4492// v8i16 shuffles - Prefer shuffles in the following order: 4493// 1. [all] pshuflw, pshufhw, optional move 4494// 2. [ssse3] 1 x pshufb 4495// 3. [ssse3] 2 x pshufb + 1 x por 4496// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4497SDValue 4498X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4499 SelectionDAG &DAG) const { 4500 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4501 SDValue V1 = SVOp->getOperand(0); 4502 SDValue V2 = SVOp->getOperand(1); 4503 DebugLoc dl = SVOp->getDebugLoc(); 4504 SmallVector<int, 8> MaskVals; 4505 4506 // Determine if more than 1 of the words in each of the low and high quadwords 4507 // of the result come from the same quadword of one of the two inputs. Undef 4508 // mask values count as coming from any quadword, for better codegen. 4509 SmallVector<unsigned, 4> LoQuad(4); 4510 SmallVector<unsigned, 4> HiQuad(4); 4511 BitVector InputQuads(4); 4512 for (unsigned i = 0; i < 8; ++i) { 4513 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4514 int EltIdx = SVOp->getMaskElt(i); 4515 MaskVals.push_back(EltIdx); 4516 if (EltIdx < 0) { 4517 ++Quad[0]; 4518 ++Quad[1]; 4519 ++Quad[2]; 4520 ++Quad[3]; 4521 continue; 4522 } 4523 ++Quad[EltIdx / 4]; 4524 InputQuads.set(EltIdx / 4); 4525 } 4526 4527 int BestLoQuad = -1; 4528 unsigned MaxQuad = 1; 4529 for (unsigned i = 0; i < 4; ++i) { 4530 if (LoQuad[i] > MaxQuad) { 4531 BestLoQuad = i; 4532 MaxQuad = LoQuad[i]; 4533 } 4534 } 4535 4536 int BestHiQuad = -1; 4537 MaxQuad = 1; 4538 for (unsigned i = 0; i < 4; ++i) { 4539 if (HiQuad[i] > MaxQuad) { 4540 BestHiQuad = i; 4541 MaxQuad = HiQuad[i]; 4542 } 4543 } 4544 4545 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 4546 // of the two input vectors, shuffle them into one input vector so only a 4547 // single pshufb instruction is necessary. If There are more than 2 input 4548 // quads, disable the next transformation since it does not help SSSE3. 4549 bool V1Used = InputQuads[0] || InputQuads[1]; 4550 bool V2Used = InputQuads[2] || InputQuads[3]; 4551 if (Subtarget->hasSSSE3()) { 4552 if (InputQuads.count() == 2 && V1Used && V2Used) { 4553 BestLoQuad = InputQuads.find_first(); 4554 BestHiQuad = InputQuads.find_next(BestLoQuad); 4555 } 4556 if (InputQuads.count() > 2) { 4557 BestLoQuad = -1; 4558 BestHiQuad = -1; 4559 } 4560 } 4561 4562 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4563 // the shuffle mask. 
If a quad is scored as -1, that means that it contains 4564 // words from all 4 input quadwords. 4565 SDValue NewV; 4566 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4567 SmallVector<int, 8> MaskV; 4568 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4569 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4570 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4571 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 4572 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 4573 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 4574 4575 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4576 // source words for the shuffle, to aid later transformations. 4577 bool AllWordsInNewV = true; 4578 bool InOrder[2] = { true, true }; 4579 for (unsigned i = 0; i != 8; ++i) { 4580 int idx = MaskVals[i]; 4581 if (idx != (int)i) 4582 InOrder[i/4] = false; 4583 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4584 continue; 4585 AllWordsInNewV = false; 4586 break; 4587 } 4588 4589 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4590 if (AllWordsInNewV) { 4591 for (int i = 0; i != 8; ++i) { 4592 int idx = MaskVals[i]; 4593 if (idx < 0) 4594 continue; 4595 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4596 if ((idx != i) && idx < 4) 4597 pshufhw = false; 4598 if ((idx != i) && idx > 3) 4599 pshuflw = false; 4600 } 4601 V1 = NewV; 4602 V2Used = false; 4603 BestLoQuad = 0; 4604 BestHiQuad = 1; 4605 } 4606 4607 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4608 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4609 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4610 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 4611 unsigned TargetMask = 0; 4612 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4613 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4614 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 4615 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 4616 V1 = NewV.getOperand(0); 4617 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 4618 } 4619 } 4620 4621 // If we have SSSE3, and all words of the result are from 1 input vector, 4622 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4623 // is present, fall back to case 4. 4624 if (Subtarget->hasSSSE3()) { 4625 SmallVector<SDValue,16> pshufbMask; 4626 4627 // If we have elements from both input vectors, set the high bit of the 4628 // shuffle mask element to zero out elements that come from V2 in the V1 4629 // mask, and elements that come from V1 in the V2 mask, so that the two 4630 // results can be OR'd together. 4631 bool TwoInputs = V1Used && V2Used; 4632 for (unsigned i = 0; i != 8; ++i) { 4633 int EltIdx = MaskVals[i] * 2; 4634 if (TwoInputs && (EltIdx >= 16)) { 4635 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4636 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4637 continue; 4638 } 4639 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4640 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4641 } 4642 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 4643 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4644 DAG.getNode(ISD::BUILD_VECTOR, dl, 4645 MVT::v16i8, &pshufbMask[0], 16)); 4646 if (!TwoInputs) 4647 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4648 4649 // Calculate the shuffle mask for the second input, shuffle it, and 4650 // OR it with the first shuffled input. 
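  // The two pshufb control vectors built here are complementary: any byte
  // kept by the first mask (index < 16) is forced to zero in the second, and
  // vice versa (a control byte with its high bit set, 0x80, makes pshufb
  // write a zero byte), so the OR below simply merges the two partial
  // results.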
4651 pshufbMask.clear(); 4652 for (unsigned i = 0; i != 8; ++i) { 4653 int EltIdx = MaskVals[i] * 2; 4654 if (EltIdx < 16) { 4655 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4656 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4657 continue; 4658 } 4659 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4660 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4661 } 4662 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 4663 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4664 DAG.getNode(ISD::BUILD_VECTOR, dl, 4665 MVT::v16i8, &pshufbMask[0], 16)); 4666 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4667 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4668 } 4669 4670 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4671 // and update MaskVals with new element order. 4672 BitVector InOrder(8); 4673 if (BestLoQuad >= 0) { 4674 SmallVector<int, 8> MaskV; 4675 for (int i = 0; i != 4; ++i) { 4676 int idx = MaskVals[i]; 4677 if (idx < 0) { 4678 MaskV.push_back(-1); 4679 InOrder.set(i); 4680 } else if ((idx / 4) == BestLoQuad) { 4681 MaskV.push_back(idx & 3); 4682 InOrder.set(i); 4683 } else { 4684 MaskV.push_back(-1); 4685 } 4686 } 4687 for (unsigned i = 4; i != 8; ++i) 4688 MaskV.push_back(i); 4689 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4690 &MaskV[0]); 4691 4692 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4693 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 4694 NewV.getOperand(0), 4695 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 4696 DAG); 4697 } 4698 4699 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4700 // and update MaskVals with the new element order. 4701 if (BestHiQuad >= 0) { 4702 SmallVector<int, 8> MaskV; 4703 for (unsigned i = 0; i != 4; ++i) 4704 MaskV.push_back(i); 4705 for (unsigned i = 4; i != 8; ++i) { 4706 int idx = MaskVals[i]; 4707 if (idx < 0) { 4708 MaskV.push_back(-1); 4709 InOrder.set(i); 4710 } else if ((idx / 4) == BestHiQuad) { 4711 MaskV.push_back((idx & 3) + 4); 4712 InOrder.set(i); 4713 } else { 4714 MaskV.push_back(-1); 4715 } 4716 } 4717 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4718 &MaskV[0]); 4719 4720 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4721 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 4722 NewV.getOperand(0), 4723 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 4724 DAG); 4725 } 4726 4727 // In case BestHi & BestLo were both -1, which means each quadword has a word 4728 // from each of the four input quadwords, calculate the InOrder bitvector now 4729 // before falling through to the insert/extract cleanup. 4730 if (BestLoQuad == -1 && BestHiQuad == -1) { 4731 NewV = V1; 4732 for (int i = 0; i != 8; ++i) 4733 if (MaskVals[i] < 0 || MaskVals[i] == i) 4734 InOrder.set(i); 4735 } 4736 4737 // The other elements are put in the right place using pextrw and pinsrw. 4738 for (unsigned i = 0; i != 8; ++i) { 4739 if (InOrder[i]) 4740 continue; 4741 int EltIdx = MaskVals[i]; 4742 if (EltIdx < 0) 4743 continue; 4744 SDValue ExtOp = (EltIdx < 8) 4745 ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4746 DAG.getIntPtrConstant(EltIdx)) 4747 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4748 DAG.getIntPtrConstant(EltIdx - 8)); 4749 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4750 DAG.getIntPtrConstant(i)); 4751 } 4752 return NewV; 4753} 4754 4755// v16i8 shuffles - Prefer shuffles in the following order: 4756// 1. [ssse3] 1 x pshufb 4757// 2. [ssse3] 2 x pshufb + 1 x por 4758// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4759static 4760SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4761 SelectionDAG &DAG, 4762 const X86TargetLowering &TLI) { 4763 SDValue V1 = SVOp->getOperand(0); 4764 SDValue V2 = SVOp->getOperand(1); 4765 DebugLoc dl = SVOp->getDebugLoc(); 4766 SmallVector<int, 16> MaskVals; 4767 SVOp->getMask(MaskVals); 4768 4769 // If we have SSSE3, case 1 is generated when all result bytes come from 4770 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4771 // present, fall back to case 3. 4772 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 4773 bool V1Only = true; 4774 bool V2Only = true; 4775 for (unsigned i = 0; i < 16; ++i) { 4776 int EltIdx = MaskVals[i]; 4777 if (EltIdx < 0) 4778 continue; 4779 if (EltIdx < 16) 4780 V2Only = false; 4781 else 4782 V1Only = false; 4783 } 4784 4785 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4786 if (TLI.getSubtarget()->hasSSSE3()) { 4787 SmallVector<SDValue,16> pshufbMask; 4788 4789 // If all result elements are from one input vector, then only translate 4790 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4791 // 4792 // Otherwise, we have elements from both input vectors, and must zero out 4793 // elements that come from V2 in the first mask, and V1 in the second mask 4794 // so that we can OR them together. 4795 bool TwoInputs = !(V1Only || V2Only); 4796 for (unsigned i = 0; i != 16; ++i) { 4797 int EltIdx = MaskVals[i]; 4798 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4799 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4800 continue; 4801 } 4802 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4803 } 4804 // If all the elements are from V2, assign it to V1 and return after 4805 // building the first pshufb. 4806 if (V2Only) 4807 V1 = V2; 4808 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4809 DAG.getNode(ISD::BUILD_VECTOR, dl, 4810 MVT::v16i8, &pshufbMask[0], 16)); 4811 if (!TwoInputs) 4812 return V1; 4813 4814 // Calculate the shuffle mask for the second input, shuffle it, and 4815 // OR it with the first shuffled input. 4816 pshufbMask.clear(); 4817 for (unsigned i = 0; i != 16; ++i) { 4818 int EltIdx = MaskVals[i]; 4819 if (EltIdx < 16) { 4820 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4821 continue; 4822 } 4823 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4824 } 4825 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4826 DAG.getNode(ISD::BUILD_VECTOR, dl, 4827 MVT::v16i8, &pshufbMask[0], 16)); 4828 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4829 } 4830 4831 // No SSSE3 - Calculate in place words and then fix all out of place words 4832 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4833 // the 16 different words that comprise the two doublequadword input vectors. 4834 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4835 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 4836 SDValue NewV = V2Only ? 
V2 : V1; 4837 for (int i = 0; i != 8; ++i) { 4838 int Elt0 = MaskVals[i*2]; 4839 int Elt1 = MaskVals[i*2+1]; 4840 4841 // This word of the result is all undef, skip it. 4842 if (Elt0 < 0 && Elt1 < 0) 4843 continue; 4844 4845 // This word of the result is already in the correct place, skip it. 4846 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4847 continue; 4848 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4849 continue; 4850 4851 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4852 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4853 SDValue InsElt; 4854 4855 // If Elt0 and Elt1 are defined, are consecutive, and can be load 4856 // using a single extract together, load it and store it. 4857 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4858 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4859 DAG.getIntPtrConstant(Elt1 / 2)); 4860 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4861 DAG.getIntPtrConstant(i)); 4862 continue; 4863 } 4864 4865 // If Elt1 is defined, extract it from the appropriate source. If the 4866 // source byte is not also odd, shift the extracted word left 8 bits 4867 // otherwise clear the bottom 8 bits if we need to do an or. 4868 if (Elt1 >= 0) { 4869 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4870 DAG.getIntPtrConstant(Elt1 / 2)); 4871 if ((Elt1 & 1) == 0) 4872 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4873 DAG.getConstant(8, TLI.getShiftAmountTy())); 4874 else if (Elt0 >= 0) 4875 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4876 DAG.getConstant(0xFF00, MVT::i16)); 4877 } 4878 // If Elt0 is defined, extract it from the appropriate source. If the 4879 // source byte is not also even, shift the extracted word right 8 bits. If 4880 // Elt1 was also defined, OR the extracted values together before 4881 // inserting them in the result. 4882 if (Elt0 >= 0) { 4883 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4884 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4885 if ((Elt0 & 1) != 0) 4886 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4887 DAG.getConstant(8, TLI.getShiftAmountTy())); 4888 else if (Elt1 >= 0) 4889 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4890 DAG.getConstant(0x00FF, MVT::i16)); 4891 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4892 : InsElt0; 4893 } 4894 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4895 DAG.getIntPtrConstant(i)); 4896 } 4897 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 4898} 4899 4900/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4901/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 4902/// done when every pair / quad of shuffle mask elements point to elements in 4903/// the right sequence. e.g. 4904/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 4905static 4906SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4907 SelectionDAG &DAG, DebugLoc dl) { 4908 EVT VT = SVOp->getValueType(0); 4909 SDValue V1 = SVOp->getOperand(0); 4910 SDValue V2 = SVOp->getOperand(1); 4911 unsigned NumElems = VT.getVectorNumElements(); 4912 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 4913 EVT NewVT; 4914 switch (VT.getSimpleVT().SimpleTy) { 4915 default: assert(false && "Unexpected!"); 4916 case MVT::v4f32: NewVT = MVT::v2f64; break; 4917 case MVT::v4i32: NewVT = MVT::v2i64; break; 4918 case MVT::v8i16: NewVT = MVT::v4i32; break; 4919 case MVT::v16i8: NewVT = MVT::v4i32; break; 4920 } 4921 4922 int Scale = NumElems / NewWidth; 4923 SmallVector<int, 8> MaskVec; 4924 for (unsigned i = 0; i < NumElems; i += Scale) { 4925 int StartIdx = -1; 4926 for (int j = 0; j < Scale; ++j) { 4927 int EltIdx = SVOp->getMaskElt(i+j); 4928 if (EltIdx < 0) 4929 continue; 4930 if (StartIdx == -1) 4931 StartIdx = EltIdx - (EltIdx % Scale); 4932 if (EltIdx != StartIdx + j) 4933 return SDValue(); 4934 } 4935 if (StartIdx == -1) 4936 MaskVec.push_back(-1); 4937 else 4938 MaskVec.push_back(StartIdx / Scale); 4939 } 4940 4941 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 4942 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 4943 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4944} 4945 4946/// getVZextMovL - Return a zero-extending vector move low node. 4947/// 4948static SDValue getVZextMovL(EVT VT, EVT OpVT, 4949 SDValue SrcOp, SelectionDAG &DAG, 4950 const X86Subtarget *Subtarget, DebugLoc dl) { 4951 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4952 LoadSDNode *LD = NULL; 4953 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4954 LD = dyn_cast<LoadSDNode>(SrcOp); 4955 if (!LD) { 4956 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4957 // instead. 4958 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4959 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 4960 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4961 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 4962 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4963 // PR2108 4964 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4965 return DAG.getNode(ISD::BITCAST, dl, VT, 4966 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4967 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4968 OpVT, 4969 SrcOp.getOperand(0) 4970 .getOperand(0)))); 4971 } 4972 } 4973 } 4974 4975 return DAG.getNode(ISD::BITCAST, dl, VT, 4976 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4977 DAG.getNode(ISD::BITCAST, dl, 4978 OpVT, SrcOp))); 4979} 4980 4981/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4982/// shuffles. 4983static SDValue 4984LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4985 SDValue V1 = SVOp->getOperand(0); 4986 SDValue V2 = SVOp->getOperand(1); 4987 DebugLoc dl = SVOp->getDebugLoc(); 4988 EVT VT = SVOp->getValueType(0); 4989 4990 SmallVector<std::pair<int, int>, 8> Locs; 4991 Locs.resize(4); 4992 SmallVector<int, 8> Mask1(4U, -1); 4993 SmallVector<int, 8> PermMask; 4994 SVOp->getMask(PermMask); 4995 4996 unsigned NumHi = 0; 4997 unsigned NumLo = 0; 4998 for (unsigned i = 0; i != 4; ++i) { 4999 int Idx = PermMask[i]; 5000 if (Idx < 0) { 5001 Locs[i] = std::make_pair(-1, -1); 5002 } else { 5003 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 5004 if (Idx < 4) { 5005 Locs[i] = std::make_pair(0, NumLo); 5006 Mask1[NumLo] = Idx; 5007 NumLo++; 5008 } else { 5009 Locs[i] = std::make_pair(1, NumHi); 5010 if (2+NumHi < 4) 5011 Mask1[2+NumHi] = Idx; 5012 NumHi++; 5013 } 5014 } 5015 } 5016 5017 if (NumLo <= 2 && NumHi <= 2) { 5018 // If no more than two elements come from either vector. This can be 5019 // implemented with two shuffles. First shuffle gather the elements. 
5020 // The second shuffle, which takes the first shuffle as both of its 5021 // vector operands, puts the elements into the right order. 5022 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5023 5024 SmallVector<int, 8> Mask2(4U, -1); 5025 5026 for (unsigned i = 0; i != 4; ++i) { 5027 if (Locs[i].first == -1) 5028 continue; 5029 else { 5030 unsigned Idx = (i < 2) ? 0 : 4; 5031 Idx += Locs[i].first * 2 + Locs[i].second; 5032 Mask2[i] = Idx; 5033 } 5034 } 5035 5036 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 5037 } else if (NumLo == 3 || NumHi == 3) { 5038 // Otherwise, we must have three elements from one vector, call it X, and 5039 // one element from the other, call it Y. First, use a shufps to build an 5040 // intermediate vector with the one element from Y and the element from X 5041 // that will be in the same half in the final destination (the indexes don't 5042 // matter). Then, use a shufps to build the final vector, taking the half 5043 // containing the element from Y from the intermediate, and the other half 5044 // from X. 5045 if (NumHi == 3) { 5046 // Normalize it so the 3 elements come from V1. 5047 CommuteVectorShuffleMask(PermMask, VT); 5048 std::swap(V1, V2); 5049 } 5050 5051 // Find the element from V2. 5052 unsigned HiIndex; 5053 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 5054 int Val = PermMask[HiIndex]; 5055 if (Val < 0) 5056 continue; 5057 if (Val >= 4) 5058 break; 5059 } 5060 5061 Mask1[0] = PermMask[HiIndex]; 5062 Mask1[1] = -1; 5063 Mask1[2] = PermMask[HiIndex^1]; 5064 Mask1[3] = -1; 5065 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5066 5067 if (HiIndex >= 2) { 5068 Mask1[0] = PermMask[0]; 5069 Mask1[1] = PermMask[1]; 5070 Mask1[2] = HiIndex & 1 ? 6 : 4; 5071 Mask1[3] = HiIndex & 1 ? 4 : 6; 5072 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5073 } else { 5074 Mask1[0] = HiIndex & 1 ? 2 : 0; 5075 Mask1[1] = HiIndex & 1 ? 0 : 2; 5076 Mask1[2] = PermMask[2]; 5077 Mask1[3] = PermMask[3]; 5078 if (Mask1[2] >= 0) 5079 Mask1[2] += 4; 5080 if (Mask1[3] >= 0) 5081 Mask1[3] += 4; 5082 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 5083 } 5084 } 5085 5086 // Break it into (shuffle shuffle_hi, shuffle_lo). 
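  // Each result element is first gathered into one of two intermediate
  // shuffles: LoMask collects the elements destined for result positions 0-1
  // and HiMask those for positions 2-3, with V1 elements placed in the low
  // half and V2 elements in the high half of each intermediate. The final
  // shuffle below then picks every result element from the matching half of
  // LoShuffle/HiShuffle.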
5087 Locs.clear(); 5088 SmallVector<int,8> LoMask(4U, -1); 5089 SmallVector<int,8> HiMask(4U, -1); 5090 5091 SmallVector<int,8> *MaskPtr = &LoMask; 5092 unsigned MaskIdx = 0; 5093 unsigned LoIdx = 0; 5094 unsigned HiIdx = 2; 5095 for (unsigned i = 0; i != 4; ++i) { 5096 if (i == 2) { 5097 MaskPtr = &HiMask; 5098 MaskIdx = 1; 5099 LoIdx = 0; 5100 HiIdx = 2; 5101 } 5102 int Idx = PermMask[i]; 5103 if (Idx < 0) { 5104 Locs[i] = std::make_pair(-1, -1); 5105 } else if (Idx < 4) { 5106 Locs[i] = std::make_pair(MaskIdx, LoIdx); 5107 (*MaskPtr)[LoIdx] = Idx; 5108 LoIdx++; 5109 } else { 5110 Locs[i] = std::make_pair(MaskIdx, HiIdx); 5111 (*MaskPtr)[HiIdx] = Idx; 5112 HiIdx++; 5113 } 5114 } 5115 5116 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 5117 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 5118 SmallVector<int, 8> MaskOps; 5119 for (unsigned i = 0; i != 4; ++i) { 5120 if (Locs[i].first == -1) { 5121 MaskOps.push_back(-1); 5122 } else { 5123 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 5124 MaskOps.push_back(Idx); 5125 } 5126 } 5127 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 5128} 5129 5130static bool MayFoldVectorLoad(SDValue V) { 5131 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5132 V = V.getOperand(0); 5133 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5134 V = V.getOperand(0); 5135 if (MayFoldLoad(V)) 5136 return true; 5137 return false; 5138} 5139 5140// FIXME: the version above should always be used. Since there's 5141// a bug where several vector shuffles can't be folded because the 5142// DAG is not updated during lowering and a node claims to have two 5143// uses while it only has one, use this version, and let isel match 5144// another instruction if the load really happens to have more than 5145// one use. Remove this version after this bug get fixed. 5146// rdar://8434668, PR8156 5147static bool RelaxedMayFoldVectorLoad(SDValue V) { 5148 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5149 V = V.getOperand(0); 5150 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5151 V = V.getOperand(0); 5152 if (ISD::isNormalLoad(V.getNode())) 5153 return true; 5154 return false; 5155} 5156 5157/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by 5158/// a vector extract, and if both can be later optimized into a single load. 5159/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 5160/// here because otherwise a target specific shuffle node is going to be 5161/// emitted for this shuffle, and the optimization not done. 5162/// FIXME: This is probably not the best approach, but fix the problem 5163/// until the right path is decided. 5164static 5165bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 5166 const TargetLowering &TLI) { 5167 EVT VT = V.getValueType(); 5168 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 5169 5170 // Be sure that the vector shuffle is present in a pattern like this: 5171 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 5172 if (!V.hasOneUse()) 5173 return false; 5174 5175 SDNode *N = *V.getNode()->use_begin(); 5176 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 5177 return false; 5178 5179 SDValue EltNo = N->getOperand(1); 5180 if (!isa<ConstantSDNode>(EltNo)) 5181 return false; 5182 5183 // If the bit convert changed the number of elements, it is unsafe 5184 // to examine the mask. 
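  // (e.g. an extract index into a v8i16 cannot be mapped onto the mask of a
  // v4i32 shuffle hidden behind the bitcast, so bail out below when the
  // element counts differ)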
5185 bool HasShuffleIntoBitcast = false; 5186 if (V.getOpcode() == ISD::BITCAST) { 5187 EVT SrcVT = V.getOperand(0).getValueType(); 5188 if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) 5189 return false; 5190 V = V.getOperand(0); 5191 HasShuffleIntoBitcast = true; 5192 } 5193 5194 // Select the input vector, guarding against out of range extract vector. 5195 unsigned NumElems = VT.getVectorNumElements(); 5196 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 5197 int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); 5198 V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); 5199 5200 // Skip one more bit_convert if necessary 5201 if (V.getOpcode() == ISD::BITCAST) 5202 V = V.getOperand(0); 5203 5204 if (ISD::isNormalLoad(V.getNode())) { 5205 // Is the original load suitable? 5206 LoadSDNode *LN0 = cast<LoadSDNode>(V); 5207 5208 // FIXME: avoid the multi-use bug that is preventing lots of 5209 // of foldings to be detected, this is still wrong of course, but 5210 // give the temporary desired behavior, and if it happens that 5211 // the load has real more uses, during isel it will not fold, and 5212 // will generate poor code. 5213 if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() 5214 return false; 5215 5216 if (!HasShuffleIntoBitcast) 5217 return true; 5218 5219 // If there's a bitcast before the shuffle, check if the load type and 5220 // alignment is valid. 5221 unsigned Align = LN0->getAlignment(); 5222 unsigned NewAlign = 5223 TLI.getTargetData()->getABITypeAlignment( 5224 VT.getTypeForEVT(*DAG.getContext())); 5225 5226 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 5227 return false; 5228 } 5229 5230 return true; 5231} 5232 5233static 5234SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { 5235 EVT VT = Op.getValueType(); 5236 5237 // Canonizalize to v2f64. 5238 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 5239 return DAG.getNode(ISD::BITCAST, dl, VT, 5240 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 5241 V1, DAG)); 5242} 5243 5244static 5245SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, 5246 bool HasSSE2) { 5247 SDValue V1 = Op.getOperand(0); 5248 SDValue V2 = Op.getOperand(1); 5249 EVT VT = Op.getValueType(); 5250 5251 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 5252 5253 if (HasSSE2 && VT == MVT::v2f64) 5254 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 5255 5256 // v4f32 or v4i32 5257 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG); 5258} 5259 5260static 5261SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { 5262 SDValue V1 = Op.getOperand(0); 5263 SDValue V2 = Op.getOperand(1); 5264 EVT VT = Op.getValueType(); 5265 5266 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 5267 "unsupported shuffle type"); 5268 5269 if (V2.getOpcode() == ISD::UNDEF) 5270 V2 = V1; 5271 5272 // v4i32 or v4f32 5273 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 5274} 5275 5276static 5277SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 5278 SDValue V1 = Op.getOperand(0); 5279 SDValue V2 = Op.getOperand(1); 5280 EVT VT = Op.getValueType(); 5281 unsigned NumElems = VT.getVectorNumElements(); 5282 5283 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 5284 // operand of these instructions is only memory, so check if there's a 5285 // potencial load folding here, otherwise use SHUFPS or MOVSD to match the 5286 // same masks. 
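  // (movlps/movlpd load 64 bits from memory into the low half of an XMM
  // register and leave the upper half untouched, which is what makes folding
  // the load into the shuffle profitable here)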
5287 bool CanFoldLoad = false; 5288 5289 // Trivial case, when V2 comes from a load. 5290 if (MayFoldVectorLoad(V2)) 5291 CanFoldLoad = true; 5292 5293 // When V1 is a load, it can be folded later into a store in isel, example: 5294 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 5295 // turns into: 5296 // (MOVLPSmr addr:$src1, VR128:$src2) 5297 // So, recognize this potential and also use MOVLPS or MOVLPD 5298 if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 5299 CanFoldLoad = true; 5300 5301 if (CanFoldLoad) { 5302 if (HasSSE2 && NumElems == 2) 5303 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 5304 5305 if (NumElems == 4) 5306 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 5307 } 5308 5309 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5310 // movl and movlp will both match v2i64, but v2i64 is never matched by 5311 // movl earlier because we make it strict to avoid messing with the movlp load 5312 // folding logic (see the code above getMOVLP call). Match it here then, 5313 // this is horrible, but will stay like this until we move all shuffle 5314 // matching to x86 specific nodes. Note that for the 1st condition all 5315 // types are matched with movsd. 5316 if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) 5317 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5318 else if (HasSSE2) 5319 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5320 5321 5322 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 5323 5324 // Invert the operand order and use SHUFPS to match it. 5325 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, 5326 X86::getShuffleSHUFImmediate(SVOp), DAG); 5327} 5328 5329static inline unsigned getUNPCKLOpcode(EVT VT) { 5330 switch(VT.getSimpleVT().SimpleTy) { 5331 case MVT::v4i32: return X86ISD::PUNPCKLDQ; 5332 case MVT::v2i64: return X86ISD::PUNPCKLQDQ; 5333 case MVT::v4f32: return X86ISD::UNPCKLPS; 5334 case MVT::v2f64: return X86ISD::UNPCKLPD; 5335 case MVT::v16i8: return X86ISD::PUNPCKLBW; 5336 case MVT::v8i16: return X86ISD::PUNPCKLWD; 5337 default: 5338 llvm_unreachable("Unknow type for unpckl"); 5339 } 5340 return 0; 5341} 5342 5343static inline unsigned getUNPCKHOpcode(EVT VT) { 5344 switch(VT.getSimpleVT().SimpleTy) { 5345 case MVT::v4i32: return X86ISD::PUNPCKHDQ; 5346 case MVT::v2i64: return X86ISD::PUNPCKHQDQ; 5347 case MVT::v4f32: return X86ISD::UNPCKHPS; 5348 case MVT::v2f64: return X86ISD::UNPCKHPD; 5349 case MVT::v16i8: return X86ISD::PUNPCKHBW; 5350 case MVT::v8i16: return X86ISD::PUNPCKHWD; 5351 default: 5352 llvm_unreachable("Unknow type for unpckh"); 5353 } 5354 return 0; 5355} 5356 5357static 5358SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, 5359 const TargetLowering &TLI, 5360 const X86Subtarget *Subtarget) { 5361 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5362 EVT VT = Op.getValueType(); 5363 DebugLoc dl = Op.getDebugLoc(); 5364 SDValue V1 = Op.getOperand(0); 5365 SDValue V2 = Op.getOperand(1); 5366 5367 if (isZeroShuffle(SVOp)) 5368 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 5369 5370 // Handle splat operations 5371 if (SVOp->isSplat()) { 5372 // Special case, this is the only place now where it's 5373 // allowed to return a vector_shuffle operation without 5374 // using a target specific node, because *hopefully* it 5375 // will be optimized away by the dag combiner. 
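  // Keep the generic node only when the vextract(shuffle(load)) transform
  // checked by CanXFormVExtractWithShuffleIntoLoad can still use it; other
  // splats of 4 or fewer elements fall through to the regular matching code,
  // and wider splats are canonicalized to a v4f32 splat by PromoteSplat
  // below.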
5376 if (VT.getVectorNumElements() <= 4 && 5377 CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) 5378 return Op; 5379 5380 // Handle splats by matching through known masks 5381 if (VT.getVectorNumElements() <= 4) 5382 return SDValue(); 5383 5384 // Canonicalize all of the remaining to v4f32. 5385 return PromoteSplat(SVOp, DAG); 5386 } 5387 5388 // If the shuffle can be profitably rewritten as a narrower shuffle, then 5389 // do it! 5390 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 5391 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5392 if (NewOp.getNode()) 5393 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 5394 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 5395 // FIXME: Figure out a cleaner way to do this. 5396 // Try to make use of movq to zero out the top part. 5397 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 5398 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5399 if (NewOp.getNode()) { 5400 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 5401 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 5402 DAG, Subtarget, dl); 5403 } 5404 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 5405 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5406 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 5407 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 5408 DAG, Subtarget, dl); 5409 } 5410 } 5411 return SDValue(); 5412} 5413 5414SDValue 5415X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 5416 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5417 SDValue V1 = Op.getOperand(0); 5418 SDValue V2 = Op.getOperand(1); 5419 EVT VT = Op.getValueType(); 5420 DebugLoc dl = Op.getDebugLoc(); 5421 unsigned NumElems = VT.getVectorNumElements(); 5422 bool isMMX = VT.getSizeInBits() == 64; 5423 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 5424 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 5425 bool V1IsSplat = false; 5426 bool V2IsSplat = false; 5427 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); 5428 bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); 5429 bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX(); 5430 MachineFunction &MF = DAG.getMachineFunction(); 5431 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 5432 5433 // Shuffle operations on MMX not supported. 5434 if (isMMX) 5435 return Op; 5436 5437 // Vector shuffle lowering takes 3 steps: 5438 // 5439 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 5440 // narrowing and commutation of operands should be handled. 5441 // 2) Matching of shuffles with known shuffle masks to x86 target specific 5442 // shuffle nodes. 5443 // 3) Rewriting of unmatched masks into new generic shuffle operations, 5444 // so the shuffle can be broken into other shuffles and the legalizer can 5445 // try the lowering again. 5446 // 5447 // The general ideia is that no vector_shuffle operation should be left to 5448 // be matched during isel, all of them must be converted to a target specific 5449 // node here. 5450 5451 // Normalize the input vectors. Here splats, zeroed vectors, profitable 5452 // narrowing and commutation of operands should be handled. The actual code 5453 // doesn't include all of those, work in progress... 
5454 SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); 5455 if (NewOp.getNode()) 5456 return NewOp; 5457 5458 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 5459 // unpckh_undef). Only use pshufd if speed is more important than size. 5460 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) 5461 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5462 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 5463 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) 5464 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5465 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5466 5467 if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && 5468 RelaxedMayFoldVectorLoad(V1)) 5469 return getMOVDDup(Op, dl, V1, DAG); 5470 5471 if (X86::isMOVHLPS_v_undef_Mask(SVOp)) 5472 return getMOVHighToLow(Op, dl, DAG); 5473 5474 // Use to match splats 5475 if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && 5476 (VT == MVT::v2f64 || VT == MVT::v2i64)) 5477 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5478 5479 if (X86::isPSHUFDMask(SVOp)) { 5480 // The actual implementation will match the mask in the if above and then 5481 // during isel it can match several different instructions, not only pshufd 5482 // as its name says, sad but true, emulate the behavior for now... 5483 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 5484 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 5485 5486 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5487 5488 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 5489 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 5490 5491 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5492 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 5493 TargetMask, DAG); 5494 5495 if (VT == MVT::v4f32) 5496 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 5497 TargetMask, DAG); 5498 } 5499 5500 // Check if this can be converted into a logical shift. 5501 bool isLeft = false; 5502 unsigned ShAmt = 0; 5503 SDValue ShVal; 5504 bool isShift = getSubtarget()->hasSSE2() && 5505 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 5506 if (isShift && ShVal.hasOneUse()) { 5507 // If the shifted value has multiple uses, it may be cheaper to use 5508 // v_set0 + movlhps or movhlps, etc. 5509 EVT EltVT = VT.getVectorElementType(); 5510 ShAmt *= EltVT.getSizeInBits(); 5511 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5512 } 5513 5514 if (X86::isMOVLMask(SVOp)) { 5515 if (V1IsUndef) 5516 return V2; 5517 if (ISD::isBuildVectorAllZeros(V1.getNode())) 5518 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 5519 if (!X86::isMOVLPMask(SVOp)) { 5520 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5521 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5522 5523 if (VT == MVT::v4i32 || VT == MVT::v4f32) 5524 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5525 } 5526 } 5527 5528 // FIXME: fold these into legal mask. 
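  // (movlhps copies the low 64 bits of its source into the high 64 bits of
  // the destination, while movhlps copies the high 64 bits of the source into
  // the low 64 bits of the destination)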
5529 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 5530 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 5531 5532 if (X86::isMOVHLPSMask(SVOp)) 5533 return getMOVHighToLow(Op, dl, DAG); 5534 5535 if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5536 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 5537 5538 if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5539 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 5540 5541 if (X86::isMOVLPMask(SVOp)) 5542 return getMOVLP(Op, dl, DAG, HasSSE2); 5543 5544 if (ShouldXformToMOVHLPS(SVOp) || 5545 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 5546 return CommuteVectorShuffle(SVOp, DAG); 5547 5548 if (isShift) { 5549 // No better options. Use a vshl / vsrl. 5550 EVT EltVT = VT.getVectorElementType(); 5551 ShAmt *= EltVT.getSizeInBits(); 5552 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5553 } 5554 5555 bool Commuted = false; 5556 // FIXME: This should also accept a bitcast of a splat? Be careful, not 5557 // 1,1,1,1 -> v8i16 though. 5558 V1IsSplat = isSplatVector(V1.getNode()); 5559 V2IsSplat = isSplatVector(V2.getNode()); 5560 5561 // Canonicalize the splat or undef, if present, to be on the RHS. 5562 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 5563 Op = CommuteVectorShuffle(SVOp, DAG); 5564 SVOp = cast<ShuffleVectorSDNode>(Op); 5565 V1 = SVOp->getOperand(0); 5566 V2 = SVOp->getOperand(1); 5567 std::swap(V1IsSplat, V2IsSplat); 5568 std::swap(V1IsUndef, V2IsUndef); 5569 Commuted = true; 5570 } 5571 5572 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 5573 // Shuffling low element of v1 into undef, just return v1. 5574 if (V2IsUndef) 5575 return V1; 5576 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 5577 // the instruction selector will not match, so get a canonical MOVL with 5578 // swapped operands to undo the commute. 5579 return getMOVL(DAG, dl, VT, V2, V1); 5580 } 5581 5582 if (X86::isUNPCKLMask(SVOp)) 5583 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); 5584 5585 if (X86::isUNPCKHMask(SVOp)) 5586 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); 5587 5588 if (V2IsSplat) { 5589 // Normalize mask so all entries that point to V2 points to its first 5590 // element then try to match unpck{h|l} again. If match, return a 5591 // new vector_shuffle with the corrected mask. 5592 SDValue NewMask = NormalizeMask(SVOp, DAG); 5593 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 5594 if (NSVOp != SVOp) { 5595 if (X86::isUNPCKLMask(NSVOp, true)) { 5596 return NewMask; 5597 } else if (X86::isUNPCKHMask(NSVOp, true)) { 5598 return NewMask; 5599 } 5600 } 5601 } 5602 5603 if (Commuted) { 5604 // Commute is back and try unpck* again. 5605 // FIXME: this seems wrong. 
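  // Note that the commuted node created below has (V2, V1) as its operands,
  // so the target nodes are emitted with the operands swapped to match its
  // mask.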
5606 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 5607 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 5608 5609 if (X86::isUNPCKLMask(NewSVOp)) 5610 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); 5611 5612 if (X86::isUNPCKHMask(NewSVOp)) 5613 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); 5614 } 5615 5616 // Normalize the node to match x86 shuffle ops if needed 5617 if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 5618 return CommuteVectorShuffle(SVOp, DAG); 5619 5620 // The checks below are all present in isShuffleMaskLegal, but they are 5621 // inlined here right now to enable us to directly emit target specific 5622 // nodes, and remove one by one until they don't return Op anymore. 5623 SmallVector<int, 16> M; 5624 SVOp->getMask(M); 5625 5626 if (isPALIGNRMask(M, VT, HasSSSE3)) 5627 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 5628 X86::getShufflePALIGNRImmediate(SVOp), 5629 DAG); 5630 5631 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 5632 SVOp->getSplatIndex() == 0 && V2IsUndef) { 5633 if (VT == MVT::v2f64) 5634 return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); 5635 if (VT == MVT::v2i64) 5636 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 5637 } 5638 5639 if (isPSHUFHWMask(M, VT)) 5640 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 5641 X86::getShufflePSHUFHWImmediate(SVOp), 5642 DAG); 5643 5644 if (isPSHUFLWMask(M, VT)) 5645 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 5646 X86::getShufflePSHUFLWImmediate(SVOp), 5647 DAG); 5648 5649 if (isSHUFPMask(M, VT)) { 5650 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5651 if (VT == MVT::v4f32 || VT == MVT::v4i32) 5652 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, 5653 TargetMask, DAG); 5654 if (VT == MVT::v2f64 || VT == MVT::v2i64) 5655 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, 5656 TargetMask, DAG); 5657 } 5658 5659 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 5660 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5661 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 5662 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 5663 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5664 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5665 5666 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 5667 if (VT == MVT::v8i16) { 5668 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 5669 if (NewOp.getNode()) 5670 return NewOp; 5671 } 5672 5673 if (VT == MVT::v16i8) { 5674 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 5675 if (NewOp.getNode()) 5676 return NewOp; 5677 } 5678 5679 // Handle all 4 wide cases with a number of shuffles. 5680 if (NumElems == 4) 5681 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 5682 5683 return SDValue(); 5684} 5685 5686SDValue 5687X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 5688 SelectionDAG &DAG) const { 5689 EVT VT = Op.getValueType(); 5690 DebugLoc dl = Op.getDebugLoc(); 5691 if (VT.getSizeInBits() == 8) { 5692 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 5693 Op.getOperand(0), Op.getOperand(1)); 5694 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5695 DAG.getValueType(VT)); 5696 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5697 } else if (VT.getSizeInBits() == 16) { 5698 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5699 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 
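  // (the desired word already sits in bits [15:0] of the vector, so a bitcast
  // to v4i32 plus an extract of element 0 lowers to a plain move out of the
  // low dword, truncated to i16)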
5700 if (Idx == 0) 5701 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5702 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5703 DAG.getNode(ISD::BITCAST, dl, 5704 MVT::v4i32, 5705 Op.getOperand(0)), 5706 Op.getOperand(1))); 5707 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 5708 Op.getOperand(0), Op.getOperand(1)); 5709 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5710 DAG.getValueType(VT)); 5711 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5712 } else if (VT == MVT::f32) { 5713 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 5714 // the result back to FR32 register. It's only worth matching if the 5715 // result has a single use which is a store or a bitcast to i32. And in 5716 // the case of a store, it's not worth it if the index is a constant 0, 5717 // because a MOVSSmr can be used instead, which is smaller and faster. 5718 if (!Op.hasOneUse()) 5719 return SDValue(); 5720 SDNode *User = *Op.getNode()->use_begin(); 5721 if ((User->getOpcode() != ISD::STORE || 5722 (isa<ConstantSDNode>(Op.getOperand(1)) && 5723 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5724 (User->getOpcode() != ISD::BITCAST || 5725 User->getValueType(0) != MVT::i32)) 5726 return SDValue(); 5727 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5728 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 5729 Op.getOperand(0)), 5730 Op.getOperand(1)); 5731 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 5732 } else if (VT == MVT::i32) { 5733 // ExtractPS works with constant index. 5734 if (isa<ConstantSDNode>(Op.getOperand(1))) 5735 return Op; 5736 } 5737 return SDValue(); 5738} 5739 5740 5741SDValue 5742X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5743 SelectionDAG &DAG) const { 5744 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5745 return SDValue(); 5746 5747 if (Subtarget->hasSSE41()) { 5748 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 5749 if (Res.getNode()) 5750 return Res; 5751 } 5752 5753 EVT VT = Op.getValueType(); 5754 DebugLoc dl = Op.getDebugLoc(); 5755 // TODO: handle v16i8. 5756 if (VT.getSizeInBits() == 16) { 5757 SDValue Vec = Op.getOperand(0); 5758 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5759 if (Idx == 0) 5760 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5761 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5762 DAG.getNode(ISD::BITCAST, dl, 5763 MVT::v4i32, Vec), 5764 Op.getOperand(1))); 5765 // Transform it so it match pextrw which produces a 32-bit result. 5766 EVT EltVT = MVT::i32; 5767 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 5768 Op.getOperand(0), Op.getOperand(1)); 5769 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 5770 DAG.getValueType(VT)); 5771 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5772 } else if (VT.getSizeInBits() == 32) { 5773 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5774 if (Idx == 0) 5775 return Op; 5776 5777 // SHUFPS the element to the lowest double word, then movss. 
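  // e.g. extracting element 2 becomes a <2,u,u,u> shuffle followed by an
  // extract of element 0, which instruction selection can then match as
  // shufps + movss (or movd for the integer case).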
5778 int Mask[4] = { Idx, -1, -1, -1 }; 5779 EVT VVT = Op.getOperand(0).getValueType(); 5780 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5781 DAG.getUNDEF(VVT), Mask); 5782 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5783 DAG.getIntPtrConstant(0)); 5784 } else if (VT.getSizeInBits() == 64) { 5785 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 5786 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 5787 // to match extract_elt for f64. 5788 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5789 if (Idx == 0) 5790 return Op; 5791 5792 // UNPCKHPD the element to the lowest double word, then movsd. 5793 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 5794 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 5795 int Mask[2] = { 1, -1 }; 5796 EVT VVT = Op.getOperand(0).getValueType(); 5797 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5798 DAG.getUNDEF(VVT), Mask); 5799 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5800 DAG.getIntPtrConstant(0)); 5801 } 5802 5803 return SDValue(); 5804} 5805 5806SDValue 5807X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 5808 SelectionDAG &DAG) const { 5809 EVT VT = Op.getValueType(); 5810 EVT EltVT = VT.getVectorElementType(); 5811 DebugLoc dl = Op.getDebugLoc(); 5812 5813 SDValue N0 = Op.getOperand(0); 5814 SDValue N1 = Op.getOperand(1); 5815 SDValue N2 = Op.getOperand(2); 5816 5817 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 5818 isa<ConstantSDNode>(N2)) { 5819 unsigned Opc; 5820 if (VT == MVT::v8i16) 5821 Opc = X86ISD::PINSRW; 5822 else if (VT == MVT::v16i8) 5823 Opc = X86ISD::PINSRB; 5824 else 5825 Opc = X86ISD::PINSRB; 5826 5827 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 5828 // argument. 5829 if (N1.getValueType() != MVT::i32) 5830 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5831 if (N2.getValueType() != MVT::i32) 5832 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5833 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 5834 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 5835 // Bits [7:6] of the constant are the source select. This will always be 5836 // zero here. The DAG Combiner may combine an extract_elt index into these 5837 // bits. For example (insert (extract, 3), 2) could be matched by putting 5838 // the '3' into bits [7:6] of X86ISD::INSERTPS. 5839 // Bits [5:4] of the constant are the destination select. This is the 5840 // value of the incoming immediate. 5841 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 5842 // combine either bitwise AND or insert of float 0.0 to set these bits. 5843 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 5844 // Create this as a scalar to vector.. 5845 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 5846 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 5847 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 5848 // PINSR* works with constant index. 
5849 return Op; 5850 } 5851 return SDValue(); 5852} 5853 5854SDValue 5855X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 5856 EVT VT = Op.getValueType(); 5857 EVT EltVT = VT.getVectorElementType(); 5858 5859 if (Subtarget->hasSSE41()) 5860 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 5861 5862 if (EltVT == MVT::i8) 5863 return SDValue(); 5864 5865 DebugLoc dl = Op.getDebugLoc(); 5866 SDValue N0 = Op.getOperand(0); 5867 SDValue N1 = Op.getOperand(1); 5868 SDValue N2 = Op.getOperand(2); 5869 5870 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5871 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 5872 // as its second argument. 5873 if (N1.getValueType() != MVT::i32) 5874 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5875 if (N2.getValueType() != MVT::i32) 5876 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5877 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 5878 } 5879 return SDValue(); 5880} 5881 5882SDValue 5883X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5884 DebugLoc dl = Op.getDebugLoc(); 5885 5886 if (Op.getValueType() == MVT::v1i64 && 5887 Op.getOperand(0).getValueType() == MVT::i64) 5888 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5889 5890 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5891 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && 5892 "Expected an SSE type!"); 5893 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), 5894 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 5895} 5896 5897// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 5898// a simple subregister reference or explicit instructions to grab 5899// upper bits of a vector. 5900SDValue 5901X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 5902 if (Subtarget->hasAVX()) { 5903 // TODO 5904 } 5905 return SDValue(); 5906} 5907 5908// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 5909// simple superregister reference or explicit instructions to insert 5910// the upper bits of a vector. 5911SDValue 5912X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 5913 if (Subtarget->hasAVX()) { 5914 DebugLoc dl = Op.getNode()->getDebugLoc(); 5915 SDValue Vec = Op.getNode()->getOperand(0); 5916 SDValue SubVec = Op.getNode()->getOperand(1); 5917 SDValue Idx = Op.getNode()->getOperand(2); 5918 5919 if (Op.getNode()->getValueType(0).getSizeInBits() == 256 5920 && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { 5921 // TODO 5922 } 5923 } 5924 return SDValue(); 5925} 5926 5927// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5928// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 5929// one of the above mentioned nodes. It has to be wrapped because otherwise 5930// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 5931// be used to form addressing mode. These wrapped nodes will be selected 5932// into MOV32ri. 5933SDValue 5934X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5935 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5936 5937 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5938 // global base reg. 
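  // e.g. with GOT-style PIC the constant pool entry is referenced as
  // $cp@GOTOFF and added to the global base register (the PIC base) below,
  // while Darwin stub PIC uses a pic-base-relative offset instead.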
5939 unsigned char OpFlag = 0; 5940 unsigned WrapperKind = X86ISD::Wrapper; 5941 CodeModel::Model M = getTargetMachine().getCodeModel(); 5942 5943 if (Subtarget->isPICStyleRIPRel() && 5944 (M == CodeModel::Small || M == CodeModel::Kernel)) 5945 WrapperKind = X86ISD::WrapperRIP; 5946 else if (Subtarget->isPICStyleGOT()) 5947 OpFlag = X86II::MO_GOTOFF; 5948 else if (Subtarget->isPICStyleStubPIC()) 5949 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5950 5951 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5952 CP->getAlignment(), 5953 CP->getOffset(), OpFlag); 5954 DebugLoc DL = CP->getDebugLoc(); 5955 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5956 // With PIC, the address is actually $g + Offset. 5957 if (OpFlag) { 5958 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5959 DAG.getNode(X86ISD::GlobalBaseReg, 5960 DebugLoc(), getPointerTy()), 5961 Result); 5962 } 5963 5964 return Result; 5965} 5966 5967SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5968 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5969 5970 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5971 // global base reg. 5972 unsigned char OpFlag = 0; 5973 unsigned WrapperKind = X86ISD::Wrapper; 5974 CodeModel::Model M = getTargetMachine().getCodeModel(); 5975 5976 if (Subtarget->isPICStyleRIPRel() && 5977 (M == CodeModel::Small || M == CodeModel::Kernel)) 5978 WrapperKind = X86ISD::WrapperRIP; 5979 else if (Subtarget->isPICStyleGOT()) 5980 OpFlag = X86II::MO_GOTOFF; 5981 else if (Subtarget->isPICStyleStubPIC()) 5982 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5983 5984 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5985 OpFlag); 5986 DebugLoc DL = JT->getDebugLoc(); 5987 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5988 5989 // With PIC, the address is actually $g + Offset. 5990 if (OpFlag) 5991 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5992 DAG.getNode(X86ISD::GlobalBaseReg, 5993 DebugLoc(), getPointerTy()), 5994 Result); 5995 5996 return Result; 5997} 5998 5999SDValue 6000X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 6001 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 6002 6003 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6004 // global base reg. 6005 unsigned char OpFlag = 0; 6006 unsigned WrapperKind = X86ISD::Wrapper; 6007 CodeModel::Model M = getTargetMachine().getCodeModel(); 6008 6009 if (Subtarget->isPICStyleRIPRel() && 6010 (M == CodeModel::Small || M == CodeModel::Kernel)) 6011 WrapperKind = X86ISD::WrapperRIP; 6012 else if (Subtarget->isPICStyleGOT()) 6013 OpFlag = X86II::MO_GOTOFF; 6014 else if (Subtarget->isPICStyleStubPIC()) 6015 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6016 6017 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 6018 6019 DebugLoc DL = Op.getDebugLoc(); 6020 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6021 6022 6023 // With PIC, the address is actually $g + Offset. 6024 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 6025 !Subtarget->is64Bit()) { 6026 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6027 DAG.getNode(X86ISD::GlobalBaseReg, 6028 DebugLoc(), getPointerTy()), 6029 Result); 6030 } 6031 6032 return Result; 6033} 6034 6035SDValue 6036X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 6037 // Create the TargetBlockAddressAddress node. 
6038 unsigned char OpFlags = 6039 Subtarget->ClassifyBlockAddressReference(); 6040 CodeModel::Model M = getTargetMachine().getCodeModel(); 6041 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 6042 DebugLoc dl = Op.getDebugLoc(); 6043 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 6044 /*isTarget=*/true, OpFlags); 6045 6046 if (Subtarget->isPICStyleRIPRel() && 6047 (M == CodeModel::Small || M == CodeModel::Kernel)) 6048 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6049 else 6050 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6051 6052 // With PIC, the address is actually $g + Offset. 6053 if (isGlobalRelativeToPICBase(OpFlags)) { 6054 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6055 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6056 Result); 6057 } 6058 6059 return Result; 6060} 6061 6062SDValue 6063X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 6064 int64_t Offset, 6065 SelectionDAG &DAG) const { 6066 // Create the TargetGlobalAddress node, folding in the constant 6067 // offset if it is legal. 6068 unsigned char OpFlags = 6069 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 6070 CodeModel::Model M = getTargetMachine().getCodeModel(); 6071 SDValue Result; 6072 if (OpFlags == X86II::MO_NO_FLAG && 6073 X86::isOffsetSuitableForCodeModel(Offset, M)) { 6074 // A direct static reference to a global. 6075 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 6076 Offset = 0; 6077 } else { 6078 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 6079 } 6080 6081 if (Subtarget->isPICStyleRIPRel() && 6082 (M == CodeModel::Small || M == CodeModel::Kernel)) 6083 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6084 else 6085 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6086 6087 // With PIC, the address is actually $g + Offset. 6088 if (isGlobalRelativeToPICBase(OpFlags)) { 6089 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6090 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6091 Result); 6092 } 6093 6094 // For globals that require a load from a stub to get the address, emit the 6095 // load. 6096 if (isGlobalStubReference(OpFlags)) 6097 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 6098 MachinePointerInfo::getGOT(), false, false, 0); 6099 6100 // If there was a non-zero offset that we didn't fold, create an explicit 6101 // addition for it. 
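  // (this is the case where the reference needed GOT/stub indirection or
  // other flags, or the offset was not suitable for the code model, so it
  // could not be folded into the TargetGlobalAddress above)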
6102 if (Offset != 0) 6103 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 6104 DAG.getConstant(Offset, getPointerTy())); 6105 6106 return Result; 6107} 6108 6109SDValue 6110X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 6111 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 6112 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 6113 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 6114} 6115 6116static SDValue 6117GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 6118 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 6119 unsigned char OperandFlags) { 6120 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6121 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 6122 DebugLoc dl = GA->getDebugLoc(); 6123 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6124 GA->getValueType(0), 6125 GA->getOffset(), 6126 OperandFlags); 6127 if (InFlag) { 6128 SDValue Ops[] = { Chain, TGA, *InFlag }; 6129 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 6130 } else { 6131 SDValue Ops[] = { Chain, TGA }; 6132 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 6133 } 6134 6135 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 6136 MFI->setAdjustsStack(true); 6137 6138 SDValue Flag = Chain.getValue(1); 6139 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 6140} 6141 6142// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 6143static SDValue 6144LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6145 const EVT PtrVT) { 6146 SDValue InFlag; 6147 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 6148 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 6149 DAG.getNode(X86ISD::GlobalBaseReg, 6150 DebugLoc(), PtrVT), InFlag); 6151 InFlag = Chain.getValue(1); 6152 6153 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 6154} 6155 6156// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 6157static SDValue 6158LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6159 const EVT PtrVT) { 6160 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 6161 X86::RAX, X86II::MO_TLSGD); 6162} 6163 6164// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 6165// "local exec" model. 6166static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6167 const EVT PtrVT, TLSModel::Model model, 6168 bool is64Bit) { 6169 DebugLoc dl = GA->getDebugLoc(); 6170 6171 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 6172 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 6173 is64Bit ? 257 : 256)); 6174 6175 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 6176 DAG.getIntPtrConstant(0), 6177 MachinePointerInfo(Ptr), false, false, 0); 6178 6179 unsigned char OperandFlags = 0; 6180 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 6181 // initialexec. 6182 unsigned WrapperKind = X86ISD::Wrapper; 6183 if (model == TLSModel::LocalExec) { 6184 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 6185 } else if (is64Bit) { 6186 assert(model == TLSModel::InitialExec); 6187 OperandFlags = X86II::MO_GOTTPOFF; 6188 WrapperKind = X86ISD::WrapperRIP; 6189 } else { 6190 assert(model == TLSModel::InitialExec); 6191 OperandFlags = X86II::MO_INDNTPOFF; 6192 } 6193 6194 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 6195 // exec) 6196 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6197 GA->getValueType(0), 6198 GA->getOffset(), OperandFlags); 6199 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 6200 6201 if (model == TLSModel::InitialExec) 6202 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 6203 MachinePointerInfo::getGOT(), false, false, 0); 6204 6205 // The address of the thread local variable is the add of the thread 6206 // pointer with the offset of the variable. 6207 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 6208} 6209 6210SDValue 6211X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 6212 6213 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 6214 const GlobalValue *GV = GA->getGlobal(); 6215 6216 if (Subtarget->isTargetELF()) { 6217 // TODO: implement the "local dynamic" model 6218 // TODO: implement the "initial exec"model for pic executables 6219 6220 // If GV is an alias then use the aliasee for determining 6221 // thread-localness. 6222 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 6223 GV = GA->resolveAliasedGlobal(false); 6224 6225 TLSModel::Model model 6226 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 6227 6228 switch (model) { 6229 case TLSModel::GeneralDynamic: 6230 case TLSModel::LocalDynamic: // not implemented 6231 if (Subtarget->is64Bit()) 6232 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 6233 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 6234 6235 case TLSModel::InitialExec: 6236 case TLSModel::LocalExec: 6237 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 6238 Subtarget->is64Bit()); 6239 } 6240 } else if (Subtarget->isTargetDarwin()) { 6241 // Darwin only has one model of TLS. Lower to that. 6242 unsigned char OpFlag = 0; 6243 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 6244 X86ISD::WrapperRIP : X86ISD::Wrapper; 6245 6246 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6247 // global base reg. 6248 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 6249 !Subtarget->is64Bit(); 6250 if (PIC32) 6251 OpFlag = X86II::MO_TLVP_PIC_BASE; 6252 else 6253 OpFlag = X86II::MO_TLVP; 6254 DebugLoc DL = Op.getDebugLoc(); 6255 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 6256 GA->getValueType(0), 6257 GA->getOffset(), OpFlag); 6258 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6259 6260 // With PIC32, the address is actually $g + Offset. 6261 if (PIC32) 6262 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6263 DAG.getNode(X86ISD::GlobalBaseReg, 6264 DebugLoc(), getPointerTy()), 6265 Offset); 6266 6267 // Lowering the machine isd will make sure everything is in the right 6268 // location. 6269 SDValue Chain = DAG.getEntryNode(); 6270 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 6271 SDValue Args[] = { Chain, Offset }; 6272 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 6273 6274 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
6275 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6276 MFI->setAdjustsStack(true); 6277 6278 // And our return value (tls address) is in the standard call return value 6279 // location. 6280 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 6281 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 6282 } 6283 6284 assert(false && 6285 "TLS not implemented for this target."); 6286 6287 llvm_unreachable("Unreachable"); 6288 return SDValue(); 6289} 6290 6291 6292/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 6293/// take a 2 x i32 value to shift plus a shift amount. 6294SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 6295 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6296 EVT VT = Op.getValueType(); 6297 unsigned VTBits = VT.getSizeInBits(); 6298 DebugLoc dl = Op.getDebugLoc(); 6299 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 6300 SDValue ShOpLo = Op.getOperand(0); 6301 SDValue ShOpHi = Op.getOperand(1); 6302 SDValue ShAmt = Op.getOperand(2); 6303 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 6304 DAG.getConstant(VTBits - 1, MVT::i8)) 6305 : DAG.getConstant(0, VT); 6306 6307 SDValue Tmp2, Tmp3; 6308 if (Op.getOpcode() == ISD::SHL_PARTS) { 6309 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 6310 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6311 } else { 6312 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 6313 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 6314 } 6315 6316 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 6317 DAG.getConstant(VTBits, MVT::i8)); 6318 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6319 AndNode, DAG.getConstant(0, MVT::i8)); 6320 6321 SDValue Hi, Lo; 6322 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6323 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 6324 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 6325 6326 if (Op.getOpcode() == ISD::SHL_PARTS) { 6327 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6328 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6329 } else { 6330 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6331 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6332 } 6333 6334 SDValue Ops[2] = { Lo, Hi }; 6335 return DAG.getMergeValues(Ops, 2, dl); 6336} 6337 6338SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 6339 SelectionDAG &DAG) const { 6340 EVT SrcVT = Op.getOperand(0).getValueType(); 6341 6342 if (SrcVT.isVector()) 6343 return SDValue(); 6344 6345 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 6346 "Unknown SINT_TO_FP to lower!"); 6347 6348 // These are really Legal; return the operand so the caller accepts it as 6349 // Legal. 
6350 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 6351 return Op; 6352 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 6353 Subtarget->is64Bit()) { 6354 return Op; 6355 } 6356 6357 DebugLoc dl = Op.getDebugLoc(); 6358 unsigned Size = SrcVT.getSizeInBits()/8; 6359 MachineFunction &MF = DAG.getMachineFunction(); 6360 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 6361 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6362 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6363 StackSlot, 6364 MachinePointerInfo::getFixedStack(SSFI), 6365 false, false, 0); 6366 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 6367} 6368 6369SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 6370 SDValue StackSlot, 6371 SelectionDAG &DAG) const { 6372 // Build the FILD 6373 DebugLoc DL = Op.getDebugLoc(); 6374 SDVTList Tys; 6375 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 6376 if (useSSE) 6377 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 6378 else 6379 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 6380 6381 unsigned ByteSize = SrcVT.getSizeInBits()/8; 6382 6383 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6384 MachineMemOperand *MMO = 6385 DAG.getMachineFunction() 6386 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6387 MachineMemOperand::MOLoad, ByteSize, ByteSize); 6388 6389 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 6390 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 6391 X86ISD::FILD, DL, 6392 Tys, Ops, array_lengthof(Ops), 6393 SrcVT, MMO); 6394 6395 if (useSSE) { 6396 Chain = Result.getValue(1); 6397 SDValue InFlag = Result.getValue(2); 6398 6399 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 6400 // shouldn't be necessary except that RFP cannot be live across 6401 // multiple blocks. When stackifier is fixed, they can be uncoupled. 6402 MachineFunction &MF = DAG.getMachineFunction(); 6403 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 6404 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 6405 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6406 Tys = DAG.getVTList(MVT::Other); 6407 SDValue Ops[] = { 6408 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 6409 }; 6410 MachineMemOperand *MMO = 6411 DAG.getMachineFunction() 6412 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6413 MachineMemOperand::MOStore, SSFISize, SSFISize); 6414 6415 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 6416 Ops, array_lengthof(Ops), 6417 Op.getValueType(), MMO); 6418 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 6419 MachinePointerInfo::getFixedStack(SSFI), 6420 false, false, 0); 6421 } 6422 6423 return Result; 6424} 6425 6426// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 6427SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 6428 SelectionDAG &DAG) const { 6429 // This algorithm is not obvious. Here it is in C code, more or less: 6430 /* 6431 double uint64_to_double( uint32_t hi, uint32_t lo ) { 6432 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 6433 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 6434 6435 // Copy ints to xmm registers. 6436 __m128i xh = _mm_cvtsi32_si128( hi ); 6437 __m128i xl = _mm_cvtsi32_si128( lo ); 6438 6439 // Combine into low half of a single xmm register. 
6440 __m128i x = _mm_unpacklo_epi32( xh, xl ); 6441 __m128d d; 6442 double sd; 6443 6444 // Merge in appropriate exponents to give the integer bits the right 6445 // magnitude. 6446 x = _mm_unpacklo_epi32( x, exp ); 6447 6448 // Subtract away the biases to deal with the IEEE-754 double precision 6449 // implicit 1. 6450 d = _mm_sub_pd( (__m128d) x, bias ); 6451 6452 // All conversions up to here are exact. The correctly rounded result is 6453 // calculated using the current rounding mode using the following 6454 // horizontal add. 6455 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 6456 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 6457 // store doesn't really need to be here (except 6458 // maybe to zero the other double) 6459 return sd; 6460 } 6461 */ 6462 6463 DebugLoc dl = Op.getDebugLoc(); 6464 LLVMContext *Context = DAG.getContext(); 6465 6466 // Build some magic constants. 6467 std::vector<Constant*> CV0; 6468 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 6469 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 6470 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6471 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6472 Constant *C0 = ConstantVector::get(CV0); 6473 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 6474 6475 std::vector<Constant*> CV1; 6476 CV1.push_back( 6477 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 6478 CV1.push_back( 6479 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 6480 Constant *C1 = ConstantVector::get(CV1); 6481 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 6482 6483 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6484 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6485 Op.getOperand(0), 6486 DAG.getIntPtrConstant(1))); 6487 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6488 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6489 Op.getOperand(0), 6490 DAG.getIntPtrConstant(0))); 6491 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 6492 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 6493 MachinePointerInfo::getConstantPool(), 6494 false, false, 16); 6495 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 6496 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); 6497 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 6498 MachinePointerInfo::getConstantPool(), 6499 false, false, 16); 6500 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 6501 6502 // Add the halves; easiest way is to swap them into another reg first. 6503 int ShufMask[2] = { 1, -1 }; 6504 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 6505 DAG.getUNDEF(MVT::v2f64), ShufMask); 6506 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 6507 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 6508 DAG.getIntPtrConstant(0)); 6509} 6510 6511// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 6512SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 6513 SelectionDAG &DAG) const { 6514 DebugLoc dl = Op.getDebugLoc(); 6515 // FP constant to bias correct the final result. 6516 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 6517 MVT::f64); 6518 6519 // Load the 32-bit value into an XMM register. 
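  // Illustrative C sketch of the whole i32 trick below (assumes SSE2 and a
  // little-endian layout; the helper name and memcpy are only for the example):
  /*
     #include <stdint.h>
     #include <string.h>
     double uint32_to_double(uint32_t lo) {
       // 0x4330000000000000 is the bit pattern of 0x1.0p52; OR-ing 'lo' into
       // the low mantissa bits builds exactly 0x1.0p52 + (double)lo.
       uint64_t bits = 0x4330000000000000ULL | (uint64_t)lo;
       double d;
       memcpy(&d, &bits, sizeof d);
       // Subtracting the bias leaves exactly (double)lo; the FP_ROUND /
       // FP_EXTEND at the end adjusts to the requested result type.
       return d - 0x1.0p52;
     }
  */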
6520 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6521 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6522 Op.getOperand(0), 6523 DAG.getIntPtrConstant(0))); 6524 6525 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6526 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 6527 DAG.getIntPtrConstant(0)); 6528 6529 // Or the load with the bias. 6530 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 6531 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 6532 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6533 MVT::v2f64, Load)), 6534 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 6535 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6536 MVT::v2f64, Bias))); 6537 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6538 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 6539 DAG.getIntPtrConstant(0)); 6540 6541 // Subtract the bias. 6542 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 6543 6544 // Handle final rounding. 6545 EVT DestVT = Op.getValueType(); 6546 6547 if (DestVT.bitsLT(MVT::f64)) { 6548 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 6549 DAG.getIntPtrConstant(0)); 6550 } else if (DestVT.bitsGT(MVT::f64)) { 6551 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 6552 } 6553 6554 // Handle final rounding. 6555 return Sub; 6556} 6557 6558SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 6559 SelectionDAG &DAG) const { 6560 SDValue N0 = Op.getOperand(0); 6561 DebugLoc dl = Op.getDebugLoc(); 6562 6563 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 6564 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 6565 // the optimization here. 6566 if (DAG.SignBitIsZero(N0)) 6567 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 6568 6569 EVT SrcVT = N0.getValueType(); 6570 EVT DstVT = Op.getValueType(); 6571 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 6572 return LowerUINT_TO_FP_i64(Op, DAG); 6573 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 6574 return LowerUINT_TO_FP_i32(Op, DAG); 6575 6576 // Make a 64-bit buffer, and use it to build an FILD. 6577 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 6578 if (SrcVT == MVT::i32) { 6579 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 6580 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 6581 getPointerTy(), StackSlot, WordOff); 6582 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6583 StackSlot, MachinePointerInfo(), 6584 false, false, 0); 6585 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 6586 OffsetSlot, MachinePointerInfo(), 6587 false, false, 0); 6588 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 6589 return Fild; 6590 } 6591 6592 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 6593 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6594 StackSlot, MachinePointerInfo(), 6595 false, false, 0); 6596 // For i64 source, we need to add the appropriate power of 2 if the input 6597 // was negative. This is the same as the optimization in 6598 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 6599 // we must be careful to do the computation in x87 extended precision, not 6600 // in SSE. (The generic code can't know it's OK to do this, or how to.) 
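  // What the fudge below corrects for (a sketch): FILD reinterprets the
  // stored 64-bit pattern as *signed*, so when the unsigned input has its top
  // bit set the result is low by exactly 2^64.  0x5F800000 is the f32
  // encoding of 2^64; the constant-pool pair below is { 2^64f, 0.0f } and the
  // SELECT picks which of the two to add.  In C, roughly (illustrative only):
  /*
     long double u64_to_fp(uint64_t x) {
       long double d = (long double)(int64_t)x;   // the FILD
       if ((int64_t)x < 0)                        // sign bit was set
         d += 0x1.0p64L;                          // the f80 fudge add
       return d;                                  // then FP_ROUND to DstVT
     }
  */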
6601 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6602 MachineMemOperand *MMO = 6603 DAG.getMachineFunction() 6604 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6605 MachineMemOperand::MOLoad, 8, 8); 6606 6607 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 6608 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 6609 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 6610 MVT::i64, MMO); 6611 6612 APInt FF(32, 0x5F800000ULL); 6613 6614 // Check whether the sign bit is set. 6615 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 6616 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 6617 ISD::SETLT); 6618 6619 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 6620 SDValue FudgePtr = DAG.getConstantPool( 6621 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 6622 getPointerTy()); 6623 6624 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 6625 SDValue Zero = DAG.getIntPtrConstant(0); 6626 SDValue Four = DAG.getIntPtrConstant(4); 6627 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 6628 Zero, Four); 6629 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 6630 6631 // Load the value out, extending it from f32 to f80. 6632 // FIXME: Avoid the extend by constructing the right constant pool? 6633 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 6634 FudgePtr, MachinePointerInfo::getConstantPool(), 6635 MVT::f32, false, false, 4); 6636 // Extend everything to 80 bits to force it to be done on x87. 6637 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 6638 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 6639} 6640 6641std::pair<SDValue,SDValue> X86TargetLowering:: 6642FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 6643 DebugLoc DL = Op.getDebugLoc(); 6644 6645 EVT DstTy = Op.getValueType(); 6646 6647 if (!IsSigned) { 6648 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 6649 DstTy = MVT::i64; 6650 } 6651 6652 assert(DstTy.getSimpleVT() <= MVT::i64 && 6653 DstTy.getSimpleVT() >= MVT::i16 && 6654 "Unknown FP_TO_SINT to lower!"); 6655 6656 // These are really Legal. 6657 if (DstTy == MVT::i32 && 6658 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6659 return std::make_pair(SDValue(), SDValue()); 6660 if (Subtarget->is64Bit() && 6661 DstTy == MVT::i64 && 6662 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6663 return std::make_pair(SDValue(), SDValue()); 6664 6665 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 6666 // stack slot. 
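  // Rough shape of what the FP_TO_INT*_IN_MEM node built below eventually
  // becomes (a sketch; the actual expansion happens when the pseudo is
  // emitted, not here):
  //   fnstcw  <saved>        ; save the x87 control word
  //   fldcw   <saved|RC=11>  ; force round-toward-zero, as C casts require
  //   fistp   [stack slot]   ; store the truncated 16/32/64-bit integer
  //   fldcw   <saved>        ; restore the caller's rounding mode
  // SSE-register inputs are first spilled and reloaded with FLD (the
  // isScalarFPTypeInSSEReg block below) so the value sits on the x87 stack.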
6667 MachineFunction &MF = DAG.getMachineFunction(); 6668 unsigned MemSize = DstTy.getSizeInBits()/8; 6669 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6670 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6671 6672 6673 6674 unsigned Opc; 6675 switch (DstTy.getSimpleVT().SimpleTy) { 6676 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 6677 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 6678 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 6679 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 6680 } 6681 6682 SDValue Chain = DAG.getEntryNode(); 6683 SDValue Value = Op.getOperand(0); 6684 EVT TheVT = Op.getOperand(0).getValueType(); 6685 if (isScalarFPTypeInSSEReg(TheVT)) { 6686 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 6687 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 6688 MachinePointerInfo::getFixedStack(SSFI), 6689 false, false, 0); 6690 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 6691 SDValue Ops[] = { 6692 Chain, StackSlot, DAG.getValueType(TheVT) 6693 }; 6694 6695 MachineMemOperand *MMO = 6696 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6697 MachineMemOperand::MOLoad, MemSize, MemSize); 6698 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 6699 DstTy, MMO); 6700 Chain = Value.getValue(1); 6701 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6702 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6703 } 6704 6705 MachineMemOperand *MMO = 6706 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6707 MachineMemOperand::MOStore, MemSize, MemSize); 6708 6709 // Build the FP_TO_INT*_IN_MEM 6710 SDValue Ops[] = { Chain, Value, StackSlot }; 6711 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 6712 Ops, 3, DstTy, MMO); 6713 6714 return std::make_pair(FIST, StackSlot); 6715} 6716 6717SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 6718 SelectionDAG &DAG) const { 6719 if (Op.getValueType().isVector()) 6720 return SDValue(); 6721 6722 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 6723 SDValue FIST = Vals.first, StackSlot = Vals.second; 6724 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 6725 if (FIST.getNode() == 0) return Op; 6726 6727 // Load the result. 6728 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6729 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 6730} 6731 6732SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 6733 SelectionDAG &DAG) const { 6734 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 6735 SDValue FIST = Vals.first, StackSlot = Vals.second; 6736 assert(FIST.getNode() && "Unexpected failure"); 6737 6738 // Load the result. 
6739 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6740 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 6741} 6742 6743SDValue X86TargetLowering::LowerFABS(SDValue Op, 6744 SelectionDAG &DAG) const { 6745 LLVMContext *Context = DAG.getContext(); 6746 DebugLoc dl = Op.getDebugLoc(); 6747 EVT VT = Op.getValueType(); 6748 EVT EltVT = VT; 6749 if (VT.isVector()) 6750 EltVT = VT.getVectorElementType(); 6751 std::vector<Constant*> CV; 6752 if (EltVT == MVT::f64) { 6753 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 6754 CV.push_back(C); 6755 CV.push_back(C); 6756 } else { 6757 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 6758 CV.push_back(C); 6759 CV.push_back(C); 6760 CV.push_back(C); 6761 CV.push_back(C); 6762 } 6763 Constant *C = ConstantVector::get(CV); 6764 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6765 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6766 MachinePointerInfo::getConstantPool(), 6767 false, false, 16); 6768 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 6769} 6770 6771SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 6772 LLVMContext *Context = DAG.getContext(); 6773 DebugLoc dl = Op.getDebugLoc(); 6774 EVT VT = Op.getValueType(); 6775 EVT EltVT = VT; 6776 if (VT.isVector()) 6777 EltVT = VT.getVectorElementType(); 6778 std::vector<Constant*> CV; 6779 if (EltVT == MVT::f64) { 6780 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 6781 CV.push_back(C); 6782 CV.push_back(C); 6783 } else { 6784 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 6785 CV.push_back(C); 6786 CV.push_back(C); 6787 CV.push_back(C); 6788 CV.push_back(C); 6789 } 6790 Constant *C = ConstantVector::get(CV); 6791 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6792 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6793 MachinePointerInfo::getConstantPool(), 6794 false, false, 16); 6795 if (VT.isVector()) { 6796 return DAG.getNode(ISD::BITCAST, dl, VT, 6797 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6798 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 6799 Op.getOperand(0)), 6800 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); 6801 } else { 6802 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6803 } 6804} 6805 6806SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6807 LLVMContext *Context = DAG.getContext(); 6808 SDValue Op0 = Op.getOperand(0); 6809 SDValue Op1 = Op.getOperand(1); 6810 DebugLoc dl = Op.getDebugLoc(); 6811 EVT VT = Op.getValueType(); 6812 EVT SrcVT = Op1.getValueType(); 6813 6814 // If second operand is smaller, extend it first. 6815 if (SrcVT.bitsLT(VT)) { 6816 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6817 SrcVT = VT; 6818 } 6819 // And if it is bigger, shrink it first. 6820 if (SrcVT.bitsGT(VT)) { 6821 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6822 SrcVT = VT; 6823 } 6824 6825 // At this point the operands and the result should have the same 6826 // type, and that won't be f80 since that is not custom lowered. 6827 6828 // First get the sign bit of second operand. 
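  // The mask scheme used below, in outline (scalar f64 sketch; the helper
  // name and memcpy are only for illustration):
  /*
     double copysign_sketch(double mag, double sgn) {
       uint64_t m, s, r;
       memcpy(&m, &mag, 8);
       memcpy(&s, &sgn, 8);
       r = (m & ~(1ULL << 63))    // Mask2 + FAND: clear Op0's sign bit
         | (s &  (1ULL << 63));   // Mask1 + FAND: keep only Op1's sign bit
       double d;
       memcpy(&d, &r, 8);
       return d;                  // FOR combines the two halves
     }
  */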
6829 std::vector<Constant*> CV; 6830 if (SrcVT == MVT::f64) { 6831 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6832 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6833 } else { 6834 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6835 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6836 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6837 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6838 } 6839 Constant *C = ConstantVector::get(CV); 6840 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6841 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6842 MachinePointerInfo::getConstantPool(), 6843 false, false, 16); 6844 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6845 6846 // Shift sign bit right or left if the two operands have different types. 6847 if (SrcVT.bitsGT(VT)) { 6848 // Op0 is MVT::f32, Op1 is MVT::f64. 6849 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6850 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6851 DAG.getConstant(32, MVT::i32)); 6852 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 6853 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6854 DAG.getIntPtrConstant(0)); 6855 } 6856 6857 // Clear first operand sign bit. 6858 CV.clear(); 6859 if (VT == MVT::f64) { 6860 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6861 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6862 } else { 6863 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6864 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6865 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6866 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6867 } 6868 C = ConstantVector::get(CV); 6869 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6870 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6871 MachinePointerInfo::getConstantPool(), 6872 false, false, 16); 6873 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6874 6875 // Or the value with the sign bit. 6876 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6877} 6878 6879/// Emit nodes that will be selected as "test Op0,Op0", or something 6880/// equivalent. 6881SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6882 SelectionDAG &DAG) const { 6883 DebugLoc dl = Op.getDebugLoc(); 6884 6885 // CF and OF aren't always set the way we want. Determine which 6886 // of these we need. 6887 bool NeedCF = false; 6888 bool NeedOF = false; 6889 switch (X86CC) { 6890 default: break; 6891 case X86::COND_A: case X86::COND_AE: 6892 case X86::COND_B: case X86::COND_BE: 6893 NeedCF = true; 6894 break; 6895 case X86::COND_G: case X86::COND_GE: 6896 case X86::COND_L: case X86::COND_LE: 6897 case X86::COND_O: case X86::COND_NO: 6898 NeedOF = true; 6899 break; 6900 } 6901 6902 // See if we can use the EFLAGS value from the operand instead of 6903 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 6904 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6905 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6906 // Emit a CMP with 0, which is the TEST pattern. 
6907 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6908 DAG.getConstant(0, Op.getValueType())); 6909 6910 unsigned Opcode = 0; 6911 unsigned NumOperands = 0; 6912 switch (Op.getNode()->getOpcode()) { 6913 case ISD::ADD: 6914 // Due to an isel shortcoming, be conservative if this add is likely to be 6915 // selected as part of a load-modify-store instruction. When the root node 6916 // in a match is a store, isel doesn't know how to remap non-chain non-flag 6917 // uses of other nodes in the match, such as the ADD in this case. This 6918 // leads to the ADD being left around and reselected, with the result being 6919 // two adds in the output. Alas, even if none our users are stores, that 6920 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 6921 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 6922 // climbing the DAG back to the root, and it doesn't seem to be worth the 6923 // effort. 6924 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6925 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6926 if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) 6927 goto default_case; 6928 6929 if (ConstantSDNode *C = 6930 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 6931 // An add of one will be selected as an INC. 6932 if (C->getAPIntValue() == 1) { 6933 Opcode = X86ISD::INC; 6934 NumOperands = 1; 6935 break; 6936 } 6937 6938 // An add of negative one (subtract of one) will be selected as a DEC. 6939 if (C->getAPIntValue().isAllOnesValue()) { 6940 Opcode = X86ISD::DEC; 6941 NumOperands = 1; 6942 break; 6943 } 6944 } 6945 6946 // Otherwise use a regular EFLAGS-setting add. 6947 Opcode = X86ISD::ADD; 6948 NumOperands = 2; 6949 break; 6950 case ISD::AND: { 6951 // If the primary and result isn't used, don't bother using X86ISD::AND, 6952 // because a TEST instruction will be better. 6953 bool NonFlagUse = false; 6954 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6955 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 6956 SDNode *User = *UI; 6957 unsigned UOpNo = UI.getOperandNo(); 6958 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 6959 // Look pass truncate. 6960 UOpNo = User->use_begin().getOperandNo(); 6961 User = *User->use_begin(); 6962 } 6963 6964 if (User->getOpcode() != ISD::BRCOND && 6965 User->getOpcode() != ISD::SETCC && 6966 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 6967 NonFlagUse = true; 6968 break; 6969 } 6970 } 6971 6972 if (!NonFlagUse) 6973 break; 6974 } 6975 // FALL THROUGH 6976 case ISD::SUB: 6977 case ISD::OR: 6978 case ISD::XOR: 6979 // Due to the ISEL shortcoming noted above, be conservative if this op is 6980 // likely to be selected as part of a load-modify-store instruction. 6981 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6982 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6983 if (UI->getOpcode() == ISD::STORE) 6984 goto default_case; 6985 6986 // Otherwise use a regular EFLAGS-setting instruction. 
6987 switch (Op.getNode()->getOpcode()) { 6988 default: llvm_unreachable("unexpected operator!"); 6989 case ISD::SUB: Opcode = X86ISD::SUB; break; 6990 case ISD::OR: Opcode = X86ISD::OR; break; 6991 case ISD::XOR: Opcode = X86ISD::XOR; break; 6992 case ISD::AND: Opcode = X86ISD::AND; break; 6993 } 6994 6995 NumOperands = 2; 6996 break; 6997 case X86ISD::ADD: 6998 case X86ISD::SUB: 6999 case X86ISD::INC: 7000 case X86ISD::DEC: 7001 case X86ISD::OR: 7002 case X86ISD::XOR: 7003 case X86ISD::AND: 7004 return SDValue(Op.getNode(), 1); 7005 default: 7006 default_case: 7007 break; 7008 } 7009 7010 if (Opcode == 0) 7011 // Emit a CMP with 0, which is the TEST pattern. 7012 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 7013 DAG.getConstant(0, Op.getValueType())); 7014 7015 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 7016 SmallVector<SDValue, 4> Ops; 7017 for (unsigned i = 0; i != NumOperands; ++i) 7018 Ops.push_back(Op.getOperand(i)); 7019 7020 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 7021 DAG.ReplaceAllUsesWith(Op, New); 7022 return SDValue(New.getNode(), 1); 7023} 7024 7025/// Emit nodes that will be selected as "cmp Op0,Op1", or something 7026/// equivalent. 7027SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 7028 SelectionDAG &DAG) const { 7029 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 7030 if (C->getAPIntValue() == 0) 7031 return EmitTest(Op0, X86CC, DAG); 7032 7033 DebugLoc dl = Op0.getDebugLoc(); 7034 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 7035} 7036 7037/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 7038/// if it's possible. 7039SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 7040 DebugLoc dl, SelectionDAG &DAG) const { 7041 SDValue Op0 = And.getOperand(0); 7042 SDValue Op1 = And.getOperand(1); 7043 if (Op0.getOpcode() == ISD::TRUNCATE) 7044 Op0 = Op0.getOperand(0); 7045 if (Op1.getOpcode() == ISD::TRUNCATE) 7046 Op1 = Op1.getOperand(0); 7047 7048 SDValue LHS, RHS; 7049 if (Op1.getOpcode() == ISD::SHL) 7050 std::swap(Op0, Op1); 7051 if (Op0.getOpcode() == ISD::SHL) { 7052 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 7053 if (And00C->getZExtValue() == 1) { 7054 // If we looked past a truncate, check that it's only truncating away 7055 // known zeros. 7056 unsigned BitWidth = Op0.getValueSizeInBits(); 7057 unsigned AndBitWidth = And.getValueSizeInBits(); 7058 if (BitWidth > AndBitWidth) { 7059 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 7060 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 7061 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 7062 return SDValue(); 7063 } 7064 LHS = Op1; 7065 RHS = Op0.getOperand(1); 7066 } 7067 } else if (Op1.getOpcode() == ISD::Constant) { 7068 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 7069 SDValue AndLHS = Op0; 7070 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 7071 LHS = AndLHS.getOperand(0); 7072 RHS = AndLHS.getOperand(1); 7073 } 7074 } 7075 7076 if (LHS.getNode()) { 7077 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 7078 // instruction. Since the shift amount is in-range-or-undefined, we know 7079 // that doing a bittest on the i32 value is ok. We extend to i32 because 7080 // the encoding for the i16 version is larger than the i32 version. 7081 // Also promote i16 to i32 for performance / code size reason. 
7082 if (LHS.getValueType() == MVT::i8 || 7083 LHS.getValueType() == MVT::i16) 7084 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 7085 7086 // If the operand types disagree, extend the shift amount to match. Since 7087 // BT ignores high bits (like shifts) we can use anyextend. 7088 if (LHS.getValueType() != RHS.getValueType()) 7089 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 7090 7091 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 7092 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 7093 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7094 DAG.getConstant(Cond, MVT::i8), BT); 7095 } 7096 7097 return SDValue(); 7098} 7099 7100SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 7101 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 7102 SDValue Op0 = Op.getOperand(0); 7103 SDValue Op1 = Op.getOperand(1); 7104 DebugLoc dl = Op.getDebugLoc(); 7105 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7106 7107 // Optimize to BT if possible. 7108 // Lower (X & (1 << N)) == 0 to BT(X, N). 7109 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 7110 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 7111 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 7112 Op1.getOpcode() == ISD::Constant && 7113 cast<ConstantSDNode>(Op1)->isNullValue() && 7114 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7115 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 7116 if (NewSetCC.getNode()) 7117 return NewSetCC; 7118 } 7119 7120 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 7121 // these. 7122 if (Op1.getOpcode() == ISD::Constant && 7123 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 7124 cast<ConstantSDNode>(Op1)->isNullValue()) && 7125 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7126 7127 // If the input is a setcc, then reuse the input setcc or use a new one with 7128 // the inverted condition. 7129 if (Op0.getOpcode() == X86ISD::SETCC) { 7130 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 7131 bool Invert = (CC == ISD::SETNE) ^ 7132 cast<ConstantSDNode>(Op1)->isNullValue(); 7133 if (!Invert) return Op0; 7134 7135 CCode = X86::GetOppositeBranchCondition(CCode); 7136 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7137 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 7138 } 7139 } 7140 7141 bool isFP = Op1.getValueType().isFloatingPoint(); 7142 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 7143 if (X86CC == X86::COND_INVALID) 7144 return SDValue(); 7145 7146 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 7147 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7148 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 7149} 7150 7151SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 7152 SDValue Cond; 7153 SDValue Op0 = Op.getOperand(0); 7154 SDValue Op1 = Op.getOperand(1); 7155 SDValue CC = Op.getOperand(2); 7156 EVT VT = Op.getValueType(); 7157 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 7158 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 7159 DebugLoc dl = Op.getDebugLoc(); 7160 7161 if (isFP) { 7162 unsigned SSECC = 8; 7163 EVT VT0 = Op0.getValueType(); 7164 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 7165 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 7166 bool Swap = false; 7167 7168 switch (SetCCOpcode) { 7169 default: break; 7170 case ISD::SETOEQ: 7171 case ISD::SETEQ: SSECC = 0; break; 7172 case ISD::SETOGT: 7173 case ISD::SETGT: Swap = true; // Fallthrough 7174 case ISD::SETLT: 7175 case ISD::SETOLT: SSECC = 1; break; 7176 case ISD::SETOGE: 7177 case ISD::SETGE: Swap = true; // Fallthrough 7178 case ISD::SETLE: 7179 case ISD::SETOLE: SSECC = 2; break; 7180 case ISD::SETUO: SSECC = 3; break; 7181 case ISD::SETUNE: 7182 case ISD::SETNE: SSECC = 4; break; 7183 case ISD::SETULE: Swap = true; 7184 case ISD::SETUGE: SSECC = 5; break; 7185 case ISD::SETULT: Swap = true; 7186 case ISD::SETUGT: SSECC = 6; break; 7187 case ISD::SETO: SSECC = 7; break; 7188 } 7189 if (Swap) 7190 std::swap(Op0, Op1); 7191 7192 // In the two special cases we can't handle, emit two comparisons. 7193 if (SSECC == 8) { 7194 if (SetCCOpcode == ISD::SETUEQ) { 7195 SDValue UNORD, EQ; 7196 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 7197 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 7198 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 7199 } 7200 else if (SetCCOpcode == ISD::SETONE) { 7201 SDValue ORD, NEQ; 7202 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 7203 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 7204 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 7205 } 7206 llvm_unreachable("Illegal FP comparison"); 7207 } 7208 // Handle all other FP comparisons here. 7209 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 7210 } 7211 7212 // We are handling one of the integer comparisons here. Since SSE only has 7213 // GT and EQ comparisons for integer, swapping operands and multiple 7214 // operations may be required for some comparisons. 7215 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 7216 bool Swap = false, Invert = false, FlipSigns = false; 7217 7218 switch (VT.getSimpleVT().SimpleTy) { 7219 default: break; 7220 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 7221 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 7222 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 7223 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 7224 } 7225 7226 switch (SetCCOpcode) { 7227 default: break; 7228 case ISD::SETNE: Invert = true; 7229 case ISD::SETEQ: Opc = EQOpc; break; 7230 case ISD::SETLT: Swap = true; 7231 case ISD::SETGT: Opc = GTOpc; break; 7232 case ISD::SETGE: Swap = true; 7233 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 7234 case ISD::SETULT: Swap = true; 7235 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 7236 case ISD::SETUGE: Swap = true; 7237 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 7238 } 7239 if (Swap) 7240 std::swap(Op0, Op1); 7241 7242 // Since SSE has no unsigned integer comparisons, we need to flip the sign 7243 // bits of the inputs before performing those operations. 
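  // The standard trick applied below (sketch): for N-bit lanes,
  //   a <u b   <==>   (a ^ SIGNBIT) <s (b ^ SIGNBIT)
  // XOR-ing both operands with the lane sign bit turns the unsigned
  // comparison into the signed PCMPGT that SSE provides.  E.g. with 8-bit
  // lanes, 0x01 <u 0xFF holds, and after the flip 0x81 (-127) <s 0x7F (127)
  // holds as well.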
7244 if (FlipSigns) { 7245 EVT EltVT = VT.getVectorElementType(); 7246 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 7247 EltVT); 7248 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 7249 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 7250 SignBits.size()); 7251 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 7252 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 7253 } 7254 7255 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 7256 7257 // If the logical-not of the result is required, perform that now. 7258 if (Invert) 7259 Result = DAG.getNOT(dl, Result, VT); 7260 7261 return Result; 7262} 7263 7264// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 7265static bool isX86LogicalCmp(SDValue Op) { 7266 unsigned Opc = Op.getNode()->getOpcode(); 7267 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 7268 return true; 7269 if (Op.getResNo() == 1 && 7270 (Opc == X86ISD::ADD || 7271 Opc == X86ISD::SUB || 7272 Opc == X86ISD::ADC || 7273 Opc == X86ISD::SBB || 7274 Opc == X86ISD::SMUL || 7275 Opc == X86ISD::UMUL || 7276 Opc == X86ISD::INC || 7277 Opc == X86ISD::DEC || 7278 Opc == X86ISD::OR || 7279 Opc == X86ISD::XOR || 7280 Opc == X86ISD::AND)) 7281 return true; 7282 7283 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 7284 return true; 7285 7286 return false; 7287} 7288 7289static bool isZero(SDValue V) { 7290 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 7291 return C && C->isNullValue(); 7292} 7293 7294static bool isAllOnes(SDValue V) { 7295 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 7296 return C && C->isAllOnesValue(); 7297} 7298 7299SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 7300 bool addTest = true; 7301 SDValue Cond = Op.getOperand(0); 7302 SDValue Op1 = Op.getOperand(1); 7303 SDValue Op2 = Op.getOperand(2); 7304 DebugLoc DL = Op.getDebugLoc(); 7305 SDValue CC; 7306 7307 if (Cond.getOpcode() == ISD::SETCC) { 7308 SDValue NewCond = LowerSETCC(Cond, DAG); 7309 if (NewCond.getNode()) 7310 Cond = NewCond; 7311 } 7312 7313 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 7314 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 7315 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 7316 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 7317 if (Cond.getOpcode() == X86ISD::SETCC && 7318 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 7319 isZero(Cond.getOperand(1).getOperand(1))) { 7320 SDValue Cmp = Cond.getOperand(1); 7321 7322 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 7323 7324 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 7325 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 7326 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 7327 7328 SDValue CmpOp0 = Cmp.getOperand(0); 7329 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 7330 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 7331 7332 SDValue Res = // Res = 0 or -1. 7333 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 7334 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 7335 7336 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 7337 Res = DAG.getNOT(DL, Res, Res.getValueType()); 7338 7339 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 7340 if (N2C == 0 || !N2C->isNullValue()) 7341 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 7342 return Res; 7343 } 7344 } 7345 7346 // Look past (and (setcc_carry (cmp ...)), 1). 
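  // Note on SETCC_CARRY (used in the select-of-0/-1 patterns above and
  // matched again below): it is ultimately selected as "sbb %reg, %reg",
  // which materializes 0 or all-ones directly from CF, so those patterns
  // avoid a CMOV entirely.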
7347 if (Cond.getOpcode() == ISD::AND && 7348 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 7349 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 7350 if (C && C->getAPIntValue() == 1) 7351 Cond = Cond.getOperand(0); 7352 } 7353 7354 // If condition flag is set by a X86ISD::CMP, then use it as the condition 7355 // setting operand in place of the X86ISD::SETCC. 7356 if (Cond.getOpcode() == X86ISD::SETCC || 7357 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7358 CC = Cond.getOperand(0); 7359 7360 SDValue Cmp = Cond.getOperand(1); 7361 unsigned Opc = Cmp.getOpcode(); 7362 EVT VT = Op.getValueType(); 7363 7364 bool IllegalFPCMov = false; 7365 if (VT.isFloatingPoint() && !VT.isVector() && 7366 !isScalarFPTypeInSSEReg(VT)) // FPStack? 7367 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 7368 7369 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 7370 Opc == X86ISD::BT) { // FIXME 7371 Cond = Cmp; 7372 addTest = false; 7373 } 7374 } 7375 7376 if (addTest) { 7377 // Look pass the truncate. 7378 if (Cond.getOpcode() == ISD::TRUNCATE) 7379 Cond = Cond.getOperand(0); 7380 7381 // We know the result of AND is compared against zero. Try to match 7382 // it to BT. 7383 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 7384 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 7385 if (NewSetCC.getNode()) { 7386 CC = NewSetCC.getOperand(0); 7387 Cond = NewSetCC.getOperand(1); 7388 addTest = false; 7389 } 7390 } 7391 } 7392 7393 if (addTest) { 7394 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7395 Cond = EmitTest(Cond, X86::COND_NE, DAG); 7396 } 7397 7398 // a < b ? -1 : 0 -> RES = ~setcc_carry 7399 // a < b ? 0 : -1 -> RES = setcc_carry 7400 // a >= b ? -1 : 0 -> RES = setcc_carry 7401 // a >= b ? 0 : -1 -> RES = ~setcc_carry 7402 if (Cond.getOpcode() == X86ISD::CMP) { 7403 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 7404 7405 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 7406 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 7407 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 7408 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 7409 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 7410 return DAG.getNOT(DL, Res, Res.getValueType()); 7411 return Res; 7412 } 7413 } 7414 7415 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 7416 // condition is true. 7417 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 7418 SDValue Ops[] = { Op2, Op1, CC, Cond }; 7419 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 7420} 7421 7422// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 7423// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 7424// from the AND / OR. 7425static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 7426 Opc = Op.getOpcode(); 7427 if (Opc != ISD::OR && Opc != ISD::AND) 7428 return false; 7429 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 7430 Op.getOperand(0).hasOneUse() && 7431 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 7432 Op.getOperand(1).hasOneUse()); 7433} 7434 7435// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 7436// 1 and that the SETCC node has a single use. 
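// For example, "brcond (xor (setcc cc, ...), 1), dest": the xor with 1
// inverts the i1 condition, so LowerBRCOND can simply branch on the opposite
// X86 condition code instead of emitting an extra test.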
7437static bool isXor1OfSetCC(SDValue Op) { 7438 if (Op.getOpcode() != ISD::XOR) 7439 return false; 7440 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 7441 if (N1C && N1C->getAPIntValue() == 1) { 7442 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 7443 Op.getOperand(0).hasOneUse(); 7444 } 7445 return false; 7446} 7447 7448SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 7449 bool addTest = true; 7450 SDValue Chain = Op.getOperand(0); 7451 SDValue Cond = Op.getOperand(1); 7452 SDValue Dest = Op.getOperand(2); 7453 DebugLoc dl = Op.getDebugLoc(); 7454 SDValue CC; 7455 7456 if (Cond.getOpcode() == ISD::SETCC) { 7457 SDValue NewCond = LowerSETCC(Cond, DAG); 7458 if (NewCond.getNode()) 7459 Cond = NewCond; 7460 } 7461#if 0 7462 // FIXME: LowerXALUO doesn't handle these!! 7463 else if (Cond.getOpcode() == X86ISD::ADD || 7464 Cond.getOpcode() == X86ISD::SUB || 7465 Cond.getOpcode() == X86ISD::SMUL || 7466 Cond.getOpcode() == X86ISD::UMUL) 7467 Cond = LowerXALUO(Cond, DAG); 7468#endif 7469 7470 // Look pass (and (setcc_carry (cmp ...)), 1). 7471 if (Cond.getOpcode() == ISD::AND && 7472 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 7473 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 7474 if (C && C->getAPIntValue() == 1) 7475 Cond = Cond.getOperand(0); 7476 } 7477 7478 // If condition flag is set by a X86ISD::CMP, then use it as the condition 7479 // setting operand in place of the X86ISD::SETCC. 7480 if (Cond.getOpcode() == X86ISD::SETCC || 7481 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7482 CC = Cond.getOperand(0); 7483 7484 SDValue Cmp = Cond.getOperand(1); 7485 unsigned Opc = Cmp.getOpcode(); 7486 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 7487 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 7488 Cond = Cmp; 7489 addTest = false; 7490 } else { 7491 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 7492 default: break; 7493 case X86::COND_O: 7494 case X86::COND_B: 7495 // These can only come from an arithmetic instruction with overflow, 7496 // e.g. SADDO, UADDO. 7497 Cond = Cond.getNode()->getOperand(1); 7498 addTest = false; 7499 break; 7500 } 7501 } 7502 } else { 7503 unsigned CondOpc; 7504 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 7505 SDValue Cmp = Cond.getOperand(0).getOperand(1); 7506 if (CondOpc == ISD::OR) { 7507 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 7508 // two branches instead of an explicit OR instruction with a 7509 // separate test. 7510 if (Cmp == Cond.getOperand(1).getOperand(1) && 7511 isX86LogicalCmp(Cmp)) { 7512 CC = Cond.getOperand(0).getOperand(0); 7513 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7514 Chain, Dest, CC, Cmp); 7515 CC = Cond.getOperand(1).getOperand(0); 7516 Cond = Cmp; 7517 addTest = false; 7518 } 7519 } else { // ISD::AND 7520 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 7521 // two branches instead of an explicit AND instruction with a 7522 // separate test. However, we only do this if this block doesn't 7523 // have a fall-through edge, because this requires an explicit 7524 // jmp when the condition is false. 
7525 if (Cmp == Cond.getOperand(1).getOperand(1) && 7526 isX86LogicalCmp(Cmp) && 7527 Op.getNode()->hasOneUse()) { 7528 X86::CondCode CCode = 7529 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7530 CCode = X86::GetOppositeBranchCondition(CCode); 7531 CC = DAG.getConstant(CCode, MVT::i8); 7532 SDNode *User = *Op.getNode()->use_begin(); 7533 // Look for an unconditional branch following this conditional branch. 7534 // We need this because we need to reverse the successors in order 7535 // to implement FCMP_OEQ. 7536 if (User->getOpcode() == ISD::BR) { 7537 SDValue FalseBB = User->getOperand(1); 7538 SDNode *NewBR = 7539 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 7540 assert(NewBR == User); 7541 (void)NewBR; 7542 Dest = FalseBB; 7543 7544 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7545 Chain, Dest, CC, Cmp); 7546 X86::CondCode CCode = 7547 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 7548 CCode = X86::GetOppositeBranchCondition(CCode); 7549 CC = DAG.getConstant(CCode, MVT::i8); 7550 Cond = Cmp; 7551 addTest = false; 7552 } 7553 } 7554 } 7555 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 7556 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 7557 // It should be transformed during dag combiner except when the condition 7558 // is set by a arithmetics with overflow node. 7559 X86::CondCode CCode = 7560 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7561 CCode = X86::GetOppositeBranchCondition(CCode); 7562 CC = DAG.getConstant(CCode, MVT::i8); 7563 Cond = Cond.getOperand(0).getOperand(1); 7564 addTest = false; 7565 } 7566 } 7567 7568 if (addTest) { 7569 // Look pass the truncate. 7570 if (Cond.getOpcode() == ISD::TRUNCATE) 7571 Cond = Cond.getOperand(0); 7572 7573 // We know the result of AND is compared against zero. Try to match 7574 // it to BT. 7575 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 7576 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 7577 if (NewSetCC.getNode()) { 7578 CC = NewSetCC.getOperand(0); 7579 Cond = NewSetCC.getOperand(1); 7580 addTest = false; 7581 } 7582 } 7583 } 7584 7585 if (addTest) { 7586 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7587 Cond = EmitTest(Cond, X86::COND_NE, DAG); 7588 } 7589 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7590 Chain, Dest, CC, Cond); 7591} 7592 7593 7594// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 7595// Calls to _alloca is needed to probe the stack when allocating more than 4k 7596// bytes in one go. Touching the stack at 4K increments is necessary to ensure 7597// that the guard pages used by the OS virtual memory manager are allocated in 7598// correct sequence. 7599SDValue 7600X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 7601 SelectionDAG &DAG) const { 7602 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && 7603 "This should be used only on Windows targets"); 7604 DebugLoc dl = Op.getDebugLoc(); 7605 7606 // Get the inputs. 7607 SDValue Chain = Op.getOperand(0); 7608 SDValue Size = Op.getOperand(1); 7609 // FIXME: Ensure alignment here 7610 7611 SDValue Flag; 7612 7613 EVT SPTy = Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32; 7614 7615 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 7616 Flag = Chain.getValue(1); 7617 7618 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 7619 7620 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 7621 Flag = Chain.getValue(1); 7622 7623 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 7624 7625 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 7626 return DAG.getMergeValues(Ops1, 2, dl); 7627} 7628 7629SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 7630 MachineFunction &MF = DAG.getMachineFunction(); 7631 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 7632 7633 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7634 DebugLoc DL = Op.getDebugLoc(); 7635 7636 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 7637 // vastart just stores the address of the VarArgsFrameIndex slot into the 7638 // memory location argument. 7639 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7640 getPointerTy()); 7641 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 7642 MachinePointerInfo(SV), false, false, 0); 7643 } 7644 7645 // __va_list_tag: 7646 // gp_offset (0 - 6 * 8) 7647 // fp_offset (48 - 48 + 8 * 16) 7648 // overflow_arg_area (point to parameters coming in memory). 7649 // reg_save_area 7650 SmallVector<SDValue, 8> MemOps; 7651 SDValue FIN = Op.getOperand(1); 7652 // Store gp_offset 7653 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 7654 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 7655 MVT::i32), 7656 FIN, MachinePointerInfo(SV), false, false, 0); 7657 MemOps.push_back(Store); 7658 7659 // Store fp_offset 7660 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7661 FIN, DAG.getIntPtrConstant(4)); 7662 Store = DAG.getStore(Op.getOperand(0), DL, 7663 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 7664 MVT::i32), 7665 FIN, MachinePointerInfo(SV, 4), false, false, 0); 7666 MemOps.push_back(Store); 7667 7668 // Store ptr to overflow_arg_area 7669 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7670 FIN, DAG.getIntPtrConstant(4)); 7671 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7672 getPointerTy()); 7673 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 7674 MachinePointerInfo(SV, 8), 7675 false, false, 0); 7676 MemOps.push_back(Store); 7677 7678 // Store ptr to reg_save_area. 
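// reg_save_area points at the block set up during argument lowering where
// the incoming register arguments were spilled: the six integer argument
// registers (6 * 8 = 48 bytes) followed by the eight XMM argument registers
// (8 * 16 = 128 bytes). gp_offset and fp_offset above index into that block,
// which is where the 0..48 and 48..176 ranges noted above come from.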
7679 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7680 FIN, DAG.getIntPtrConstant(8)); 7681 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 7682 getPointerTy()); 7683 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 7684 MachinePointerInfo(SV, 16), false, false, 0); 7685 MemOps.push_back(Store); 7686 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 7687 &MemOps[0], MemOps.size()); 7688} 7689 7690SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 7691 assert(Subtarget->is64Bit() && 7692 "LowerVAARG only handles 64-bit va_arg!"); 7693 assert((Subtarget->isTargetLinux() || 7694 Subtarget->isTargetDarwin()) && 7695 "Unhandled target in LowerVAARG"); 7696 assert(Op.getNode()->getNumOperands() == 4); 7697 SDValue Chain = Op.getOperand(0); 7698 SDValue SrcPtr = Op.getOperand(1); 7699 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7700 unsigned Align = Op.getConstantOperandVal(3); 7701 DebugLoc dl = Op.getDebugLoc(); 7702 7703 EVT ArgVT = Op.getNode()->getValueType(0); 7704 const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 7705 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 7706 uint8_t ArgMode; 7707 7708 // Decide which area this value should be read from. 7709 // TODO: Implement the AMD64 ABI in its entirety. This simple 7710 // selection mechanism works only for the basic types. 7711 if (ArgVT == MVT::f80) { 7712 llvm_unreachable("va_arg for f80 not yet implemented"); 7713 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 7714 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 7715 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 7716 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 7717 } else { 7718 llvm_unreachable("Unhandled argument type in LowerVAARG"); 7719 } 7720 7721 if (ArgMode == 2) { 7722 // Sanity Check: Make sure using fp_offset makes sense. 7723 assert(!UseSoftFloat && 7724 !(DAG.getMachineFunction() 7725 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 7726 Subtarget->hasXMM()); 7727 } 7728 7729 // Insert VAARG_64 node into the DAG 7730 // VAARG_64 returns two values: Variable Argument Address, Chain 7731 SmallVector<SDValue, 11> InstOps; 7732 InstOps.push_back(Chain); 7733 InstOps.push_back(SrcPtr); 7734 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 7735 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 7736 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 7737 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 7738 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 7739 VTs, &InstOps[0], InstOps.size(), 7740 MVT::i64, 7741 MachinePointerInfo(SV), 7742 /*Align=*/0, 7743 /*Volatile=*/false, 7744 /*ReadMem=*/true, 7745 /*WriteMem=*/true); 7746 Chain = VAARG.getValue(1); 7747 7748 // Load the next argument and return it 7749 return DAG.getLoad(ArgVT, dl, 7750 Chain, 7751 VAARG, 7752 MachinePointerInfo(), 7753 false, false, 0); 7754} 7755 7756SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 7757 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
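// Spelled out (following the usual AMD64 ABI definition), which is why
// 24 bytes are copied below with 8-byte alignment:
//
//   typedef struct {
//     unsigned gp_offset;       // byte offset  0
//     unsigned fp_offset;       // byte offset  4
//     void *overflow_arg_area;  // byte offset  8
//     void *reg_save_area;      // byte offset 16
//   } __va_list_tag;            // sizeof == 24, alignof == 8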
7758 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 7759 SDValue Chain = Op.getOperand(0); 7760 SDValue DstPtr = Op.getOperand(1); 7761 SDValue SrcPtr = Op.getOperand(2); 7762 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 7763 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7764 DebugLoc DL = Op.getDebugLoc(); 7765 7766 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 7767 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 7768 false, 7769 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 7770} 7771 7772SDValue 7773X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 7774 DebugLoc dl = Op.getDebugLoc(); 7775 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7776 switch (IntNo) { 7777 default: return SDValue(); // Don't custom lower most intrinsics. 7778 // Comparison intrinsics. 7779 case Intrinsic::x86_sse_comieq_ss: 7780 case Intrinsic::x86_sse_comilt_ss: 7781 case Intrinsic::x86_sse_comile_ss: 7782 case Intrinsic::x86_sse_comigt_ss: 7783 case Intrinsic::x86_sse_comige_ss: 7784 case Intrinsic::x86_sse_comineq_ss: 7785 case Intrinsic::x86_sse_ucomieq_ss: 7786 case Intrinsic::x86_sse_ucomilt_ss: 7787 case Intrinsic::x86_sse_ucomile_ss: 7788 case Intrinsic::x86_sse_ucomigt_ss: 7789 case Intrinsic::x86_sse_ucomige_ss: 7790 case Intrinsic::x86_sse_ucomineq_ss: 7791 case Intrinsic::x86_sse2_comieq_sd: 7792 case Intrinsic::x86_sse2_comilt_sd: 7793 case Intrinsic::x86_sse2_comile_sd: 7794 case Intrinsic::x86_sse2_comigt_sd: 7795 case Intrinsic::x86_sse2_comige_sd: 7796 case Intrinsic::x86_sse2_comineq_sd: 7797 case Intrinsic::x86_sse2_ucomieq_sd: 7798 case Intrinsic::x86_sse2_ucomilt_sd: 7799 case Intrinsic::x86_sse2_ucomile_sd: 7800 case Intrinsic::x86_sse2_ucomigt_sd: 7801 case Intrinsic::x86_sse2_ucomige_sd: 7802 case Intrinsic::x86_sse2_ucomineq_sd: { 7803 unsigned Opc = 0; 7804 ISD::CondCode CC = ISD::SETCC_INVALID; 7805 switch (IntNo) { 7806 default: break; 7807 case Intrinsic::x86_sse_comieq_ss: 7808 case Intrinsic::x86_sse2_comieq_sd: 7809 Opc = X86ISD::COMI; 7810 CC = ISD::SETEQ; 7811 break; 7812 case Intrinsic::x86_sse_comilt_ss: 7813 case Intrinsic::x86_sse2_comilt_sd: 7814 Opc = X86ISD::COMI; 7815 CC = ISD::SETLT; 7816 break; 7817 case Intrinsic::x86_sse_comile_ss: 7818 case Intrinsic::x86_sse2_comile_sd: 7819 Opc = X86ISD::COMI; 7820 CC = ISD::SETLE; 7821 break; 7822 case Intrinsic::x86_sse_comigt_ss: 7823 case Intrinsic::x86_sse2_comigt_sd: 7824 Opc = X86ISD::COMI; 7825 CC = ISD::SETGT; 7826 break; 7827 case Intrinsic::x86_sse_comige_ss: 7828 case Intrinsic::x86_sse2_comige_sd: 7829 Opc = X86ISD::COMI; 7830 CC = ISD::SETGE; 7831 break; 7832 case Intrinsic::x86_sse_comineq_ss: 7833 case Intrinsic::x86_sse2_comineq_sd: 7834 Opc = X86ISD::COMI; 7835 CC = ISD::SETNE; 7836 break; 7837 case Intrinsic::x86_sse_ucomieq_ss: 7838 case Intrinsic::x86_sse2_ucomieq_sd: 7839 Opc = X86ISD::UCOMI; 7840 CC = ISD::SETEQ; 7841 break; 7842 case Intrinsic::x86_sse_ucomilt_ss: 7843 case Intrinsic::x86_sse2_ucomilt_sd: 7844 Opc = X86ISD::UCOMI; 7845 CC = ISD::SETLT; 7846 break; 7847 case Intrinsic::x86_sse_ucomile_ss: 7848 case Intrinsic::x86_sse2_ucomile_sd: 7849 Opc = X86ISD::UCOMI; 7850 CC = ISD::SETLE; 7851 break; 7852 case Intrinsic::x86_sse_ucomigt_ss: 7853 case Intrinsic::x86_sse2_ucomigt_sd: 7854 Opc = X86ISD::UCOMI; 7855 CC = ISD::SETGT; 7856 break; 7857 case Intrinsic::x86_sse_ucomige_ss: 7858 case Intrinsic::x86_sse2_ucomige_sd: 7859 Opc = X86ISD::UCOMI; 7860 
CC = ISD::SETGE; 7861 break; 7862 case Intrinsic::x86_sse_ucomineq_ss: 7863 case Intrinsic::x86_sse2_ucomineq_sd: 7864 Opc = X86ISD::UCOMI; 7865 CC = ISD::SETNE; 7866 break; 7867 } 7868 7869 SDValue LHS = Op.getOperand(1); 7870 SDValue RHS = Op.getOperand(2); 7871 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 7872 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 7873 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 7874 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7875 DAG.getConstant(X86CC, MVT::i8), Cond); 7876 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7877 } 7878 // ptest and testp intrinsics. The intrinsic these come from are designed to 7879 // return an integer value, not just an instruction so lower it to the ptest 7880 // or testp pattern and a setcc for the result. 7881 case Intrinsic::x86_sse41_ptestz: 7882 case Intrinsic::x86_sse41_ptestc: 7883 case Intrinsic::x86_sse41_ptestnzc: 7884 case Intrinsic::x86_avx_ptestz_256: 7885 case Intrinsic::x86_avx_ptestc_256: 7886 case Intrinsic::x86_avx_ptestnzc_256: 7887 case Intrinsic::x86_avx_vtestz_ps: 7888 case Intrinsic::x86_avx_vtestc_ps: 7889 case Intrinsic::x86_avx_vtestnzc_ps: 7890 case Intrinsic::x86_avx_vtestz_pd: 7891 case Intrinsic::x86_avx_vtestc_pd: 7892 case Intrinsic::x86_avx_vtestnzc_pd: 7893 case Intrinsic::x86_avx_vtestz_ps_256: 7894 case Intrinsic::x86_avx_vtestc_ps_256: 7895 case Intrinsic::x86_avx_vtestnzc_ps_256: 7896 case Intrinsic::x86_avx_vtestz_pd_256: 7897 case Intrinsic::x86_avx_vtestc_pd_256: 7898 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7899 bool IsTestPacked = false; 7900 unsigned X86CC = 0; 7901 switch (IntNo) { 7902 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7903 case Intrinsic::x86_avx_vtestz_ps: 7904 case Intrinsic::x86_avx_vtestz_pd: 7905 case Intrinsic::x86_avx_vtestz_ps_256: 7906 case Intrinsic::x86_avx_vtestz_pd_256: 7907 IsTestPacked = true; // Fallthrough 7908 case Intrinsic::x86_sse41_ptestz: 7909 case Intrinsic::x86_avx_ptestz_256: 7910 // ZF = 1 7911 X86CC = X86::COND_E; 7912 break; 7913 case Intrinsic::x86_avx_vtestc_ps: 7914 case Intrinsic::x86_avx_vtestc_pd: 7915 case Intrinsic::x86_avx_vtestc_ps_256: 7916 case Intrinsic::x86_avx_vtestc_pd_256: 7917 IsTestPacked = true; // Fallthrough 7918 case Intrinsic::x86_sse41_ptestc: 7919 case Intrinsic::x86_avx_ptestc_256: 7920 // CF = 1 7921 X86CC = X86::COND_B; 7922 break; 7923 case Intrinsic::x86_avx_vtestnzc_ps: 7924 case Intrinsic::x86_avx_vtestnzc_pd: 7925 case Intrinsic::x86_avx_vtestnzc_ps_256: 7926 case Intrinsic::x86_avx_vtestnzc_pd_256: 7927 IsTestPacked = true; // Fallthrough 7928 case Intrinsic::x86_sse41_ptestnzc: 7929 case Intrinsic::x86_avx_ptestnzc_256: 7930 // ZF and CF = 0 7931 X86CC = X86::COND_A; 7932 break; 7933 } 7934 7935 SDValue LHS = Op.getOperand(1); 7936 SDValue RHS = Op.getOperand(2); 7937 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7938 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7939 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7940 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7941 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7942 } 7943 7944 // Fix vector shift instructions where the last operand is a non-immediate 7945 // i32 value. 
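// For example, an llvm.x86.sse2.pslli.w call whose count operand is not a
// compile-time constant is rewritten below as llvm.x86.sse2.psll.w, with the
// scalar count widened to the vector <count, 0, undef, undef> (upper bits
// zeroed) and bitcast to the operand's vector type, since the register forms
// of these instructions read a 64-bit shift amount.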
7946 case Intrinsic::x86_sse2_pslli_w: 7947 case Intrinsic::x86_sse2_pslli_d: 7948 case Intrinsic::x86_sse2_pslli_q: 7949 case Intrinsic::x86_sse2_psrli_w: 7950 case Intrinsic::x86_sse2_psrli_d: 7951 case Intrinsic::x86_sse2_psrli_q: 7952 case Intrinsic::x86_sse2_psrai_w: 7953 case Intrinsic::x86_sse2_psrai_d: 7954 case Intrinsic::x86_mmx_pslli_w: 7955 case Intrinsic::x86_mmx_pslli_d: 7956 case Intrinsic::x86_mmx_pslli_q: 7957 case Intrinsic::x86_mmx_psrli_w: 7958 case Intrinsic::x86_mmx_psrli_d: 7959 case Intrinsic::x86_mmx_psrli_q: 7960 case Intrinsic::x86_mmx_psrai_w: 7961 case Intrinsic::x86_mmx_psrai_d: { 7962 SDValue ShAmt = Op.getOperand(2); 7963 if (isa<ConstantSDNode>(ShAmt)) 7964 return SDValue(); 7965 7966 unsigned NewIntNo = 0; 7967 EVT ShAmtVT = MVT::v4i32; 7968 switch (IntNo) { 7969 case Intrinsic::x86_sse2_pslli_w: 7970 NewIntNo = Intrinsic::x86_sse2_psll_w; 7971 break; 7972 case Intrinsic::x86_sse2_pslli_d: 7973 NewIntNo = Intrinsic::x86_sse2_psll_d; 7974 break; 7975 case Intrinsic::x86_sse2_pslli_q: 7976 NewIntNo = Intrinsic::x86_sse2_psll_q; 7977 break; 7978 case Intrinsic::x86_sse2_psrli_w: 7979 NewIntNo = Intrinsic::x86_sse2_psrl_w; 7980 break; 7981 case Intrinsic::x86_sse2_psrli_d: 7982 NewIntNo = Intrinsic::x86_sse2_psrl_d; 7983 break; 7984 case Intrinsic::x86_sse2_psrli_q: 7985 NewIntNo = Intrinsic::x86_sse2_psrl_q; 7986 break; 7987 case Intrinsic::x86_sse2_psrai_w: 7988 NewIntNo = Intrinsic::x86_sse2_psra_w; 7989 break; 7990 case Intrinsic::x86_sse2_psrai_d: 7991 NewIntNo = Intrinsic::x86_sse2_psra_d; 7992 break; 7993 default: { 7994 ShAmtVT = MVT::v2i32; 7995 switch (IntNo) { 7996 case Intrinsic::x86_mmx_pslli_w: 7997 NewIntNo = Intrinsic::x86_mmx_psll_w; 7998 break; 7999 case Intrinsic::x86_mmx_pslli_d: 8000 NewIntNo = Intrinsic::x86_mmx_psll_d; 8001 break; 8002 case Intrinsic::x86_mmx_pslli_q: 8003 NewIntNo = Intrinsic::x86_mmx_psll_q; 8004 break; 8005 case Intrinsic::x86_mmx_psrli_w: 8006 NewIntNo = Intrinsic::x86_mmx_psrl_w; 8007 break; 8008 case Intrinsic::x86_mmx_psrli_d: 8009 NewIntNo = Intrinsic::x86_mmx_psrl_d; 8010 break; 8011 case Intrinsic::x86_mmx_psrli_q: 8012 NewIntNo = Intrinsic::x86_mmx_psrl_q; 8013 break; 8014 case Intrinsic::x86_mmx_psrai_w: 8015 NewIntNo = Intrinsic::x86_mmx_psra_w; 8016 break; 8017 case Intrinsic::x86_mmx_psrai_d: 8018 NewIntNo = Intrinsic::x86_mmx_psra_d; 8019 break; 8020 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 8021 } 8022 break; 8023 } 8024 } 8025 8026 // The vector shift intrinsics with scalars uses 32b shift amounts but 8027 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 8028 // to be zero. 8029 SDValue ShOps[4]; 8030 ShOps[0] = ShAmt; 8031 ShOps[1] = DAG.getConstant(0, MVT::i32); 8032 if (ShAmtVT == MVT::v4i32) { 8033 ShOps[2] = DAG.getUNDEF(MVT::i32); 8034 ShOps[3] = DAG.getUNDEF(MVT::i32); 8035 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 8036 } else { 8037 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 8038// FIXME this must be lowered to get rid of the invalid type. 
8039 } 8040 8041 EVT VT = Op.getValueType(); 8042 ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); 8043 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8044 DAG.getConstant(NewIntNo, MVT::i32), 8045 Op.getOperand(1), ShAmt); 8046 } 8047 } 8048} 8049 8050SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 8051 SelectionDAG &DAG) const { 8052 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8053 MFI->setReturnAddressIsTaken(true); 8054 8055 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8056 DebugLoc dl = Op.getDebugLoc(); 8057 8058 if (Depth > 0) { 8059 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 8060 SDValue Offset = 8061 DAG.getConstant(TD->getPointerSize(), 8062 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 8063 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 8064 DAG.getNode(ISD::ADD, dl, getPointerTy(), 8065 FrameAddr, Offset), 8066 MachinePointerInfo(), false, false, 0); 8067 } 8068 8069 // Just load the return address. 8070 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 8071 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 8072 RetAddrFI, MachinePointerInfo(), false, false, 0); 8073} 8074 8075SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 8076 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8077 MFI->setFrameAddressIsTaken(true); 8078 8079 EVT VT = Op.getValueType(); 8080 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 8081 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8082 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 8083 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 8084 while (Depth--) 8085 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 8086 MachinePointerInfo(), 8087 false, false, 0); 8088 return FrameAddr; 8089} 8090 8091SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 8092 SelectionDAG &DAG) const { 8093 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 8094} 8095 8096SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 8097 MachineFunction &MF = DAG.getMachineFunction(); 8098 SDValue Chain = Op.getOperand(0); 8099 SDValue Offset = Op.getOperand(1); 8100 SDValue Handler = Op.getOperand(2); 8101 DebugLoc dl = Op.getDebugLoc(); 8102 8103 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 8104 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 8105 getPointerTy()); 8106 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 8107 8108 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 8109 DAG.getIntPtrConstant(TD->getPointerSize())); 8110 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 8111 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 8112 false, false, 0); 8113 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 8114 MF.getRegInfo().addLiveOut(StoreAddrReg); 8115 8116 return DAG.getNode(X86ISD::EH_RETURN, dl, 8117 MVT::Other, 8118 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 8119} 8120 8121SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 8122 SelectionDAG &DAG) const { 8123 SDValue Root = Op.getOperand(0); 8124 SDValue Trmp = Op.getOperand(1); // trampoline 8125 SDValue FPtr = Op.getOperand(2); // nested function 8126 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 8127 DebugLoc dl = Op.getDebugLoc(); 8128 8129 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 8130 8131 if (Subtarget->is64Bit()) { 8132 SDValue OutChains[6]; 8133 8134 // Large code-model. 8135 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 8136 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 8137 8138 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 8139 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 8140 8141 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 8142 8143 // Load the pointer to the nested function into R11. 8144 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 8145 SDValue Addr = Trmp; 8146 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8147 Addr, MachinePointerInfo(TrmpAddr), 8148 false, false, 0); 8149 8150 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8151 DAG.getConstant(2, MVT::i64)); 8152 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 8153 MachinePointerInfo(TrmpAddr, 2), 8154 false, false, 2); 8155 8156 // Load the 'nest' parameter value into R10. 8157 // R10 is specified in X86CallingConv.td 8158 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 8159 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8160 DAG.getConstant(10, MVT::i64)); 8161 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8162 Addr, MachinePointerInfo(TrmpAddr, 10), 8163 false, false, 0); 8164 8165 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8166 DAG.getConstant(12, MVT::i64)); 8167 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 8168 MachinePointerInfo(TrmpAddr, 12), 8169 false, false, 2); 8170 8171 // Jump to the nested function. 8172 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
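// Taken together, the stores above and below assemble (roughly) this 23-byte
// trampoline; the exact bytes come from the REX/ModRM encodings computed
// from the register numbers above:
//
//   offset  0:  49 BB <imm64>   movabsq $fptr, %r11
//   offset 10:  49 BA <imm64>   movabsq $nest, %r10
//   offset 20:  49 FF E3        jmpq   *%r11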
8173 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8174 DAG.getConstant(20, MVT::i64)); 8175 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8176 Addr, MachinePointerInfo(TrmpAddr, 20), 8177 false, false, 0); 8178 8179 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 8180 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8181 DAG.getConstant(22, MVT::i64)); 8182 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 8183 MachinePointerInfo(TrmpAddr, 22), 8184 false, false, 0); 8185 8186 SDValue Ops[] = 8187 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 8188 return DAG.getMergeValues(Ops, 2, dl); 8189 } else { 8190 const Function *Func = 8191 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 8192 CallingConv::ID CC = Func->getCallingConv(); 8193 unsigned NestReg; 8194 8195 switch (CC) { 8196 default: 8197 llvm_unreachable("Unsupported calling convention"); 8198 case CallingConv::C: 8199 case CallingConv::X86_StdCall: { 8200 // Pass 'nest' parameter in ECX. 8201 // Must be kept in sync with X86CallingConv.td 8202 NestReg = X86::ECX; 8203 8204 // Check that ECX wasn't needed by an 'inreg' parameter. 8205 const FunctionType *FTy = Func->getFunctionType(); 8206 const AttrListPtr &Attrs = Func->getAttributes(); 8207 8208 if (!Attrs.isEmpty() && !Func->isVarArg()) { 8209 unsigned InRegCount = 0; 8210 unsigned Idx = 1; 8211 8212 for (FunctionType::param_iterator I = FTy->param_begin(), 8213 E = FTy->param_end(); I != E; ++I, ++Idx) 8214 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 8215 // FIXME: should only count parameters that are lowered to integers. 8216 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 8217 8218 if (InRegCount > 2) { 8219 report_fatal_error("Nest register in use - reduce number of inreg" 8220 " parameters!"); 8221 } 8222 } 8223 break; 8224 } 8225 case CallingConv::X86_FastCall: 8226 case CallingConv::X86_ThisCall: 8227 case CallingConv::Fast: 8228 // Pass 'nest' parameter in EAX. 8229 // Must be kept in sync with X86CallingConv.td 8230 NestReg = X86::EAX; 8231 break; 8232 } 8233 8234 SDValue OutChains[4]; 8235 SDValue Addr, Disp; 8236 8237 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8238 DAG.getConstant(10, MVT::i32)); 8239 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 8240 8241 // This is storing the opcode for MOV32ri. 8242 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 8243 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 8244 OutChains[0] = DAG.getStore(Root, dl, 8245 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 8246 Trmp, MachinePointerInfo(TrmpAddr), 8247 false, false, 0); 8248 8249 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8250 DAG.getConstant(1, MVT::i32)); 8251 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 8252 MachinePointerInfo(TrmpAddr, 1), 8253 false, false, 1); 8254 8255 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
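// The resulting 10-byte 32-bit trampoline is, roughly:
//
//   offset 0:  B8+reg <imm32>   movl  $nest, %ecx or %eax  (the nest register)
//   offset 5:  E9     <rel32>   jmp   <nested function>
//
// where rel32 is the displacement computed above as FPtr minus the end of
// the jmp instruction (Trmp + 10).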
8256 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8257 DAG.getConstant(5, MVT::i32)); 8258 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 8259 MachinePointerInfo(TrmpAddr, 5), 8260 false, false, 1); 8261 8262 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8263 DAG.getConstant(6, MVT::i32)); 8264 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 8265 MachinePointerInfo(TrmpAddr, 6), 8266 false, false, 1); 8267 8268 SDValue Ops[] = 8269 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 8270 return DAG.getMergeValues(Ops, 2, dl); 8271 } 8272} 8273 8274SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 8275 SelectionDAG &DAG) const { 8276 /* 8277 The rounding mode is in bits 11:10 of FPSR, and has the following 8278 settings: 8279 00 Round to nearest 8280 01 Round to -inf 8281 10 Round to +inf 8282 11 Round to 0 8283 8284 FLT_ROUNDS, on the other hand, expects the following: 8285 -1 Undefined 8286 0 Round to 0 8287 1 Round to nearest 8288 2 Round to +inf 8289 3 Round to -inf 8290 8291 To perform the conversion, we do: 8292 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 8293 */ 8294 8295 MachineFunction &MF = DAG.getMachineFunction(); 8296 const TargetMachine &TM = MF.getTarget(); 8297 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 8298 unsigned StackAlignment = TFI.getStackAlignment(); 8299 EVT VT = Op.getValueType(); 8300 DebugLoc DL = Op.getDebugLoc(); 8301 8302 // Save FP Control Word to stack slot 8303 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 8304 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8305 8306 8307 MachineMemOperand *MMO = 8308 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8309 MachineMemOperand::MOStore, 2, 2); 8310 8311 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 8312 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 8313 DAG.getVTList(MVT::Other), 8314 Ops, 2, MVT::i16, MMO); 8315 8316 // Load FP Control Word from stack slot 8317 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 8318 MachinePointerInfo(), false, false, 0); 8319 8320 // Transform as necessary 8321 SDValue CWD1 = 8322 DAG.getNode(ISD::SRL, DL, MVT::i16, 8323 DAG.getNode(ISD::AND, DL, MVT::i16, 8324 CWD, DAG.getConstant(0x800, MVT::i16)), 8325 DAG.getConstant(11, MVT::i8)); 8326 SDValue CWD2 = 8327 DAG.getNode(ISD::SRL, DL, MVT::i16, 8328 DAG.getNode(ISD::AND, DL, MVT::i16, 8329 CWD, DAG.getConstant(0x400, MVT::i16)), 8330 DAG.getConstant(9, MVT::i8)); 8331 8332 SDValue RetVal = 8333 DAG.getNode(ISD::AND, DL, MVT::i16, 8334 DAG.getNode(ISD::ADD, DL, MVT::i16, 8335 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 8336 DAG.getConstant(1, MVT::i16)), 8337 DAG.getConstant(3, MVT::i16)); 8338 8339 8340 return DAG.getNode((VT.getSizeInBits() < 16 ? 8341 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 8342} 8343 8344SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 8345 EVT VT = Op.getValueType(); 8346 EVT OpVT = VT; 8347 unsigned NumBits = VT.getSizeInBits(); 8348 DebugLoc dl = Op.getDebugLoc(); 8349 8350 Op = Op.getOperand(0); 8351 if (VT == MVT::i8) { 8352 // Zero extend to i32 since there is not an i8 bsr. 8353 OpVT = MVT::i32; 8354 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8355 } 8356 8357 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 8358 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8359 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 8360 8361 // If src is zero (i.e. 
bsr sets ZF), returns NumBits. 8362 SDValue Ops[] = { 8363 Op, 8364 DAG.getConstant(NumBits+NumBits-1, OpVT), 8365 DAG.getConstant(X86::COND_E, MVT::i8), 8366 Op.getValue(1) 8367 }; 8368 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8369 8370 // Finally xor with NumBits-1. 8371 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 8372 8373 if (VT == MVT::i8) 8374 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8375 return Op; 8376} 8377 8378SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 8379 EVT VT = Op.getValueType(); 8380 EVT OpVT = VT; 8381 unsigned NumBits = VT.getSizeInBits(); 8382 DebugLoc dl = Op.getDebugLoc(); 8383 8384 Op = Op.getOperand(0); 8385 if (VT == MVT::i8) { 8386 OpVT = MVT::i32; 8387 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8388 } 8389 8390 // Issue a bsf (scan bits forward) which also sets EFLAGS. 8391 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8392 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 8393 8394 // If src is zero (i.e. bsf sets ZF), returns NumBits. 8395 SDValue Ops[] = { 8396 Op, 8397 DAG.getConstant(NumBits, OpVT), 8398 DAG.getConstant(X86::COND_E, MVT::i8), 8399 Op.getValue(1) 8400 }; 8401 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8402 8403 if (VT == MVT::i8) 8404 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8405 return Op; 8406} 8407 8408SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 8409 EVT VT = Op.getValueType(); 8410 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 8411 DebugLoc dl = Op.getDebugLoc(); 8412 8413 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 8414 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 8415 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 8416 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 8417 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 8418 // 8419 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 8420 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 8421 // return AloBlo + AloBhi + AhiBlo; 8422 8423 SDValue A = Op.getOperand(0); 8424 SDValue B = Op.getOperand(1); 8425 8426 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8427 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8428 A, DAG.getConstant(32, MVT::i32)); 8429 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8430 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8431 B, DAG.getConstant(32, MVT::i32)); 8432 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8433 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8434 A, B); 8435 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8436 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8437 A, Bhi); 8438 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8439 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8440 Ahi, B); 8441 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8442 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8443 AloBhi, DAG.getConstant(32, MVT::i32)); 8444 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8445 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8446 AhiBlo, DAG.getConstant(32, MVT::i32)); 8447 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 8448 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 8449 return Res; 8450} 8451 8452SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { 8453 EVT VT = Op.getValueType(); 8454 DebugLoc dl = 
Op.getDebugLoc(); 8455 SDValue R = Op.getOperand(0); 8456 8457 LLVMContext *Context = DAG.getContext(); 8458 8459 assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); 8460 8461 if (VT == MVT::v4i32) { 8462 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8463 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 8464 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 8465 8466 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 8467 8468 std::vector<Constant*> CV(4, CI); 8469 Constant *C = ConstantVector::get(CV); 8470 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8471 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8472 MachinePointerInfo::getConstantPool(), 8473 false, false, 16); 8474 8475 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 8476 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 8477 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 8478 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 8479 } 8480 if (VT == MVT::v16i8) { 8481 // a = a << 5; 8482 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8483 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 8484 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 8485 8486 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 8487 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 8488 8489 std::vector<Constant*> CVM1(16, CM1); 8490 std::vector<Constant*> CVM2(16, CM2); 8491 Constant *C = ConstantVector::get(CVM1); 8492 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8493 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8494 MachinePointerInfo::getConstantPool(), 8495 false, false, 16); 8496 8497 // r = pblendv(r, psllw(r & (char16)15, 4), a); 8498 M = DAG.getNode(ISD::AND, dl, VT, R, M); 8499 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8500 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 8501 DAG.getConstant(4, MVT::i32)); 8502 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 8503 // a += a 8504 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 8505 8506 C = ConstantVector::get(CVM2); 8507 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8508 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8509 MachinePointerInfo::getConstantPool(), 8510 false, false, 16); 8511 8512 // r = pblendv(r, psllw(r & (char16)63, 2), a); 8513 M = DAG.getNode(ISD::AND, dl, VT, R, M); 8514 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8515 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 8516 DAG.getConstant(2, MVT::i32)); 8517 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 8518 // a += a 8519 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 8520 8521 // return pblendv(r, r+r, a); 8522 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, 8523 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 8524 return R; 8525 } 8526 return SDValue(); 8527} 8528 8529SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 8530 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 8531 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 8532 // looks for this combo and may remove the "setcc" instruction if the "setcc" 8533 // has only one use. 
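// A sketch of the transformation (the node spelling here is only
// illustrative):
//
//   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//
// becomes roughly
//
//   %sum = X86ISD::ADD %a, %b             ; value, also produces EFLAGS
//   %ovf = X86ISD::SETCC COND_O, flags    ; the i1 overflow result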
8534 SDNode *N = Op.getNode(); 8535 SDValue LHS = N->getOperand(0); 8536 SDValue RHS = N->getOperand(1); 8537 unsigned BaseOp = 0; 8538 unsigned Cond = 0; 8539 DebugLoc DL = Op.getDebugLoc(); 8540 switch (Op.getOpcode()) { 8541 default: llvm_unreachable("Unknown ovf instruction!"); 8542 case ISD::SADDO: 8543 // An add of one will be selected as an INC. Note that INC doesn't 8544 // set CF, so we can't do this for UADDO. 8545 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 8546 if (C->getAPIntValue() == 1) { 8547 BaseOp = X86ISD::INC; 8548 Cond = X86::COND_O; 8549 break; 8550 } 8551 BaseOp = X86ISD::ADD; 8552 Cond = X86::COND_O; 8553 break; 8554 case ISD::UADDO: 8555 BaseOp = X86ISD::ADD; 8556 Cond = X86::COND_B; 8557 break; 8558 case ISD::SSUBO: 8559 // A subtract of one will be selected as a DEC. Note that DEC doesn't 8560 // set CF, so we can't do this for USUBO. 8561 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 8562 if (C->getAPIntValue() == 1) { 8563 BaseOp = X86ISD::DEC; 8564 Cond = X86::COND_O; 8565 break; 8566 } 8567 BaseOp = X86ISD::SUB; 8568 Cond = X86::COND_O; 8569 break; 8570 case ISD::USUBO: 8571 BaseOp = X86ISD::SUB; 8572 Cond = X86::COND_B; 8573 break; 8574 case ISD::SMULO: 8575 BaseOp = X86ISD::SMUL; 8576 Cond = X86::COND_O; 8577 break; 8578 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 8579 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 8580 MVT::i32); 8581 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 8582 8583 SDValue SetCC = 8584 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 8585 DAG.getConstant(X86::COND_O, MVT::i32), 8586 SDValue(Sum.getNode(), 2)); 8587 8588 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 8589 return Sum; 8590 } 8591 } 8592 8593 // Also sets EFLAGS. 8594 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 8595 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 8596 8597 SDValue SetCC = 8598 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 8599 DAG.getConstant(Cond, MVT::i32), 8600 SDValue(Sum.getNode(), 1)); 8601 8602 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 8603 return Sum; 8604} 8605 8606 SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const { 8607 DebugLoc dl = Op.getDebugLoc(); 8608 8609 if (!Subtarget->hasSSE2()) { 8610 SDValue Chain = Op.getOperand(0); 8611 SDValue Zero = DAG.getConstant(0, 8612 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 8613 SDValue Ops[] = { 8614 DAG.getRegister(X86::ESP, MVT::i32), // Base 8615 DAG.getTargetConstant(1, MVT::i8), // Scale 8616 DAG.getRegister(0, MVT::i32), // Index 8617 DAG.getTargetConstant(0, MVT::i32), // Disp 8618 DAG.getRegister(0, MVT::i32), // Segment.
8619 Zero, 8620 Chain 8621 }; 8622 SDNode *Res = 8623 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 8624 array_lengthof(Ops)); 8625 return SDValue(Res, 0); 8626 } 8627 8628 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 8629 if (!isDev) 8630 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 8631 8632 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8633 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8634 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 8635 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 8636 8637 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 8638 if (!Op1 && !Op2 && !Op3 && Op4) 8639 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 8640 8641 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 8642 if (Op1 && !Op2 && !Op3 && !Op4) 8643 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 8644 8645 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 8646 // (MFENCE)>; 8647 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 8648} 8649 8650SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 8651 EVT T = Op.getValueType(); 8652 DebugLoc DL = Op.getDebugLoc(); 8653 unsigned Reg = 0; 8654 unsigned size = 0; 8655 switch(T.getSimpleVT().SimpleTy) { 8656 default: 8657 assert(false && "Invalid value type!"); 8658 case MVT::i8: Reg = X86::AL; size = 1; break; 8659 case MVT::i16: Reg = X86::AX; size = 2; break; 8660 case MVT::i32: Reg = X86::EAX; size = 4; break; 8661 case MVT::i64: 8662 assert(Subtarget->is64Bit() && "Node not type legal!"); 8663 Reg = X86::RAX; size = 8; 8664 break; 8665 } 8666 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 8667 Op.getOperand(2), SDValue()); 8668 SDValue Ops[] = { cpIn.getValue(0), 8669 Op.getOperand(1), 8670 Op.getOperand(3), 8671 DAG.getTargetConstant(size, MVT::i8), 8672 cpIn.getValue(1) }; 8673 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 8674 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 8675 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 8676 Ops, 5, T, MMO); 8677 SDValue cpOut = 8678 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 8679 return cpOut; 8680} 8681 8682SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 8683 SelectionDAG &DAG) const { 8684 assert(Subtarget->is64Bit() && "Result not type legalized?"); 8685 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 8686 SDValue TheChain = Op.getOperand(0); 8687 DebugLoc dl = Op.getDebugLoc(); 8688 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8689 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 8690 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 8691 rax.getValue(2)); 8692 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 8693 DAG.getConstant(32, MVT::i8)); 8694 SDValue Ops[] = { 8695 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 8696 rdx.getValue(1) 8697 }; 8698 return DAG.getMergeValues(Ops, 2, dl); 8699} 8700 8701SDValue X86TargetLowering::LowerBITCAST(SDValue Op, 8702 SelectionDAG &DAG) const { 8703 EVT SrcVT = Op.getOperand(0).getValueType(); 8704 EVT DstVT = Op.getValueType(); 8705 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 8706 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 
8707 assert((DstVT == MVT::i64 || 8708 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 8709 "Unexpected custom BITCAST"); 8710 // i64 <=> MMX conversions are Legal. 8711 if (SrcVT==MVT::i64 && DstVT.isVector()) 8712 return Op; 8713 if (DstVT==MVT::i64 && SrcVT.isVector()) 8714 return Op; 8715 // MMX <=> MMX conversions are Legal. 8716 if (SrcVT.isVector() && DstVT.isVector()) 8717 return Op; 8718 // All other conversions need to be expanded. 8719 return SDValue(); 8720} 8721 8722SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 8723 SDNode *Node = Op.getNode(); 8724 DebugLoc dl = Node->getDebugLoc(); 8725 EVT T = Node->getValueType(0); 8726 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 8727 DAG.getConstant(0, T), Node->getOperand(2)); 8728 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 8729 cast<AtomicSDNode>(Node)->getMemoryVT(), 8730 Node->getOperand(0), 8731 Node->getOperand(1), negOp, 8732 cast<AtomicSDNode>(Node)->getSrcValue(), 8733 cast<AtomicSDNode>(Node)->getAlignment()); 8734} 8735 8736static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 8737 EVT VT = Op.getNode()->getValueType(0); 8738 8739 // Let legalize expand this if it isn't a legal type yet. 8740 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 8741 return SDValue(); 8742 8743 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 8744 8745 unsigned Opc; 8746 bool ExtraOp = false; 8747 switch (Op.getOpcode()) { 8748 default: assert(0 && "Invalid code"); 8749 case ISD::ADDC: Opc = X86ISD::ADD; break; 8750 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 8751 case ISD::SUBC: Opc = X86ISD::SUB; break; 8752 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 8753 } 8754 8755 if (!ExtraOp) 8756 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 8757 Op.getOperand(1)); 8758 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 8759 Op.getOperand(1), Op.getOperand(2)); 8760} 8761 8762/// LowerOperation - Provide custom lowering hooks for some operations. 
8763/// 8764SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 8765 switch (Op.getOpcode()) { 8766 default: llvm_unreachable("Should not custom lower this!"); 8767 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 8768 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 8769 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 8770 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 8771 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 8772 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 8773 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 8774 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 8775 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 8776 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); 8777 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 8778 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 8779 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 8780 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 8781 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 8782 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 8783 case ISD::SHL_PARTS: 8784 case ISD::SRA_PARTS: 8785 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 8786 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 8787 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 8788 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 8789 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 8790 case ISD::FABS: return LowerFABS(Op, DAG); 8791 case ISD::FNEG: return LowerFNEG(Op, DAG); 8792 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 8793 case ISD::SETCC: return LowerSETCC(Op, DAG); 8794 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 8795 case ISD::SELECT: return LowerSELECT(Op, DAG); 8796 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 8797 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 8798 case ISD::VASTART: return LowerVASTART(Op, DAG); 8799 case ISD::VAARG: return LowerVAARG(Op, DAG); 8800 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 8801 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 8802 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 8803 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 8804 case ISD::FRAME_TO_ARGS_OFFSET: 8805 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 8806 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 8807 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 8808 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 8809 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 8810 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 8811 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 8812 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 8813 case ISD::SHL: return LowerSHL(Op, DAG); 8814 case ISD::SADDO: 8815 case ISD::UADDO: 8816 case ISD::SSUBO: 8817 case ISD::USUBO: 8818 case ISD::SMULO: 8819 case ISD::UMULO: return LowerXALUO(Op, DAG); 8820 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 8821 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 8822 case ISD::ADDC: 8823 case ISD::ADDE: 8824 case ISD::SUBC: 8825 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 8826 } 8827} 8828 8829void X86TargetLowering:: 8830ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 8831 SelectionDAG &DAG, unsigned NewOp) const { 8832 EVT T = Node->getValueType(0); 8833 
DebugLoc dl = Node->getDebugLoc(); 8834 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 8835 8836 SDValue Chain = Node->getOperand(0); 8837 SDValue In1 = Node->getOperand(1); 8838 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8839 Node->getOperand(2), DAG.getIntPtrConstant(0)); 8840 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8841 Node->getOperand(2), DAG.getIntPtrConstant(1)); 8842 SDValue Ops[] = { Chain, In1, In2L, In2H }; 8843 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 8844 SDValue Result = 8845 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 8846 cast<MemSDNode>(Node)->getMemOperand()); 8847 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 8848 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8849 Results.push_back(Result.getValue(2)); 8850} 8851 8852/// ReplaceNodeResults - Replace a node with an illegal result type 8853/// with a new node built out of custom code. 8854void X86TargetLowering::ReplaceNodeResults(SDNode *N, 8855 SmallVectorImpl<SDValue>&Results, 8856 SelectionDAG &DAG) const { 8857 DebugLoc dl = N->getDebugLoc(); 8858 switch (N->getOpcode()) { 8859 default: 8860 assert(false && "Do not know how to custom type legalize this operation!"); 8861 return; 8862 case ISD::ADDC: 8863 case ISD::ADDE: 8864 case ISD::SUBC: 8865 case ISD::SUBE: 8866 // We don't want to expand or promote these. 8867 return; 8868 case ISD::FP_TO_SINT: { 8869 std::pair<SDValue,SDValue> Vals = 8870 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 8871 SDValue FIST = Vals.first, StackSlot = Vals.second; 8872 if (FIST.getNode() != 0) { 8873 EVT VT = N->getValueType(0); 8874 // Return a load from the stack slot. 8875 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 8876 MachinePointerInfo(), false, false, 0)); 8877 } 8878 return; 8879 } 8880 case ISD::READCYCLECOUNTER: { 8881 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 8882 SDValue TheChain = N->getOperand(0); 8883 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8884 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 8885 rd.getValue(1)); 8886 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 8887 eax.getValue(2)); 8888 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
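// That is, conceptually result = ((uint64_t)EDX << 32) | EAX, with EDX:EAX
// holding the value produced by RDTSC on a 32-bit target.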
8889 SDValue Ops[] = { eax, edx }; 8890 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 8891 Results.push_back(edx.getValue(1)); 8892 return; 8893 } 8894 case ISD::ATOMIC_CMP_SWAP: { 8895 EVT T = N->getValueType(0); 8896 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 8897 SDValue cpInL, cpInH; 8898 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8899 DAG.getConstant(0, MVT::i32)); 8900 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8901 DAG.getConstant(1, MVT::i32)); 8902 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 8903 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 8904 cpInL.getValue(1)); 8905 SDValue swapInL, swapInH; 8906 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8907 DAG.getConstant(0, MVT::i32)); 8908 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8909 DAG.getConstant(1, MVT::i32)); 8910 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 8911 cpInH.getValue(1)); 8912 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 8913 swapInL.getValue(1)); 8914 SDValue Ops[] = { swapInH.getValue(0), 8915 N->getOperand(1), 8916 swapInH.getValue(1) }; 8917 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 8918 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 8919 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, 8920 Ops, 3, T, MMO); 8921 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 8922 MVT::i32, Result.getValue(1)); 8923 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 8924 MVT::i32, cpOutL.getValue(2)); 8925 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 8926 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8927 Results.push_back(cpOutH.getValue(1)); 8928 return; 8929 } 8930 case ISD::ATOMIC_LOAD_ADD: 8931 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 8932 return; 8933 case ISD::ATOMIC_LOAD_AND: 8934 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 8935 return; 8936 case ISD::ATOMIC_LOAD_NAND: 8937 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 8938 return; 8939 case ISD::ATOMIC_LOAD_OR: 8940 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 8941 return; 8942 case ISD::ATOMIC_LOAD_SUB: 8943 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 8944 return; 8945 case ISD::ATOMIC_LOAD_XOR: 8946 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 8947 return; 8948 case ISD::ATOMIC_SWAP: 8949 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 8950 return; 8951 } 8952} 8953 8954const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 8955 switch (Opcode) { 8956 default: return NULL; 8957 case X86ISD::BSF: return "X86ISD::BSF"; 8958 case X86ISD::BSR: return "X86ISD::BSR"; 8959 case X86ISD::SHLD: return "X86ISD::SHLD"; 8960 case X86ISD::SHRD: return "X86ISD::SHRD"; 8961 case X86ISD::FAND: return "X86ISD::FAND"; 8962 case X86ISD::FOR: return "X86ISD::FOR"; 8963 case X86ISD::FXOR: return "X86ISD::FXOR"; 8964 case X86ISD::FSRL: return "X86ISD::FSRL"; 8965 case X86ISD::FILD: return "X86ISD::FILD"; 8966 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 8967 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 8968 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 8969 case 
X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 8970 case X86ISD::FLD: return "X86ISD::FLD"; 8971 case X86ISD::FST: return "X86ISD::FST"; 8972 case X86ISD::CALL: return "X86ISD::CALL"; 8973 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 8974 case X86ISD::BT: return "X86ISD::BT"; 8975 case X86ISD::CMP: return "X86ISD::CMP"; 8976 case X86ISD::COMI: return "X86ISD::COMI"; 8977 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 8978 case X86ISD::SETCC: return "X86ISD::SETCC"; 8979 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 8980 case X86ISD::CMOV: return "X86ISD::CMOV"; 8981 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 8982 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 8983 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 8984 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 8985 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 8986 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 8987 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 8988 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 8989 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 8990 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 8991 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 8992 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 8993 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 8994 case X86ISD::PANDN: return "X86ISD::PANDN"; 8995 case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; 8996 case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; 8997 case X86ISD::PSIGND: return "X86ISD::PSIGND"; 8998 case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB"; 8999 case X86ISD::FMAX: return "X86ISD::FMAX"; 9000 case X86ISD::FMIN: return "X86ISD::FMIN"; 9001 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 9002 case X86ISD::FRCP: return "X86ISD::FRCP"; 9003 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 9004 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 9005 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 9006 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 9007 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 9008 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 9009 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 9010 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 9011 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 9012 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 9013 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 9014 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 9015 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 9016 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 9017 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 9018 case X86ISD::VSHL: return "X86ISD::VSHL"; 9019 case X86ISD::VSRL: return "X86ISD::VSRL"; 9020 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 9021 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 9022 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 9023 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 9024 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 9025 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 9026 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 9027 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 9028 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 9029 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 9030 case X86ISD::ADD: return "X86ISD::ADD"; 9031 case X86ISD::SUB: return "X86ISD::SUB"; 9032 case X86ISD::ADC: return "X86ISD::ADC"; 9033 case X86ISD::SBB: return "X86ISD::SBB"; 9034 case X86ISD::SMUL: return "X86ISD::SMUL"; 9035 case X86ISD::UMUL: return "X86ISD::UMUL"; 
9036 case X86ISD::INC: return "X86ISD::INC"; 9037 case X86ISD::DEC: return "X86ISD::DEC"; 9038 case X86ISD::OR: return "X86ISD::OR"; 9039 case X86ISD::XOR: return "X86ISD::XOR"; 9040 case X86ISD::AND: return "X86ISD::AND"; 9041 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 9042 case X86ISD::PTEST: return "X86ISD::PTEST"; 9043 case X86ISD::TESTP: return "X86ISD::TESTP"; 9044 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 9045 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 9046 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 9047 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 9048 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 9049 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 9050 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 9051 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 9052 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 9053 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 9054 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 9055 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 9056 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 9057 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 9058 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 9059 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 9060 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 9061 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 9062 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 9063 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 9064 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 9065 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 9066 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 9067 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 9068 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 9069 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 9070 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 9071 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 9072 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 9073 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 9074 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 9075 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 9076 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 9077 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 9078 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 9079 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 9080 } 9081} 9082 9083// isLegalAddressingMode - Return true if the addressing mode represented 9084// by AM is legal for this target, for a load/store of the specified type. 9085bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 9086 const Type *Ty) const { 9087 // X86 supports extremely general addressing modes. 9088 CodeModel::Model M = getTargetMachine().getCodeModel(); 9089 Reloc::Model R = getTargetMachine().getRelocationModel(); 9090 9091 // X86 allows a sign-extended 32-bit immediate field as a displacement. 9092 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 9093 return false; 9094 9095 if (AM.BaseGV) { 9096 unsigned GVFlags = 9097 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 9098 9099 // If a reference to this global requires an extra load, we can't fold it. 9100 if (isGlobalStubReference(GVFlags)) 9101 return false; 9102 9103 // If BaseGV requires a register for the PIC base, we cannot also have a 9104 // BaseReg specified. 9105 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 9106 return false; 9107 9108 // If lower 4G is not available, then we must use rip-relative addressing. 
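// For example, folding a global into "sym+16(%rax,%rcx,4)" requires sym to be
// encodable as an absolute, sign-extended 32-bit displacement, which only the
// small code model with static relocation guarantees; outside of that the
// global must be addressed RIP-relative, which is why a base offset or a
// scale greater than 1 is rejected below on 64-bit targets.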
9109 if ((M != CodeModel::Small || R != Reloc::Static) && 9110 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 9111 return false; 9112 } 9113 9114 switch (AM.Scale) { 9115 case 0: 9116 case 1: 9117 case 2: 9118 case 4: 9119 case 8: 9120 // These scales always work. 9121 break; 9122 case 3: 9123 case 5: 9124 case 9: 9125 // These scales are formed with basereg+scalereg. Only accept if there is 9126 // no basereg yet. 9127 if (AM.HasBaseReg) 9128 return false; 9129 break; 9130 default: // Other stuff never works. 9131 return false; 9132 } 9133 9134 return true; 9135} 9136 9137 9138bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 9139 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 9140 return false; 9141 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 9142 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 9143 if (NumBits1 <= NumBits2) 9144 return false; 9145 return true; 9146} 9147 9148bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 9149 if (!VT1.isInteger() || !VT2.isInteger()) 9150 return false; 9151 unsigned NumBits1 = VT1.getSizeInBits(); 9152 unsigned NumBits2 = VT2.getSizeInBits(); 9153 if (NumBits1 <= NumBits2) 9154 return false; 9155 return true; 9156} 9157 9158bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 9159 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 9160 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 9161} 9162 9163bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 9164 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 9165 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 9166} 9167 9168bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 9169 // i16 instructions are longer (0x66 prefix) and potentially slower. 9170 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 9171} 9172 9173/// isShuffleMaskLegal - Targets can use this to indicate that they only 9174/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 9175/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 9176/// are assumed to be legal. 9177bool 9178X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 9179 EVT VT) const { 9180 // Very little shuffling can be done for 64-bit vectors right now. 9181 if (VT.getSizeInBits() == 64) 9182 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 9183 9184 // FIXME: pshufb, blends, shifts. 9185 return (VT.getVectorNumElements() == 2 || 9186 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 9187 isMOVLMask(M, VT) || 9188 isSHUFPMask(M, VT) || 9189 isPSHUFDMask(M, VT) || 9190 isPSHUFHWMask(M, VT) || 9191 isPSHUFLWMask(M, VT) || 9192 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 9193 isUNPCKLMask(M, VT) || 9194 isUNPCKHMask(M, VT) || 9195 isUNPCKL_v_undef_Mask(M, VT) || 9196 isUNPCKH_v_undef_Mask(M, VT)); 9197} 9198 9199bool 9200X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 9201 EVT VT) const { 9202 unsigned NumElts = VT.getVectorNumElements(); 9203 // FIXME: This collection of masks seems suspect. 
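  // Roughly speaking, a "clear" mask is what the DAG combiner asks about when
  // it wants to replace an AND with a constant vector of all-ones/all-zeros
  // elements by a VECTOR_SHUFFLE against a zero vector. Any mask index
  // >= NumElts selects a lane of the zero vector, so on a v4i32 input a mask
  // such as <0, 5, 2, 7> keeps lanes 0 and 2 and clears lanes 1 and 3.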
9204 if (NumElts == 2) 9205 return true; 9206 if (NumElts == 4 && VT.getSizeInBits() == 128) { 9207 return (isMOVLMask(Mask, VT) || 9208 isCommutedMOVLMask(Mask, VT, true) || 9209 isSHUFPMask(Mask, VT) || 9210 isCommutedSHUFPMask(Mask, VT)); 9211 } 9212 return false; 9213} 9214 9215//===----------------------------------------------------------------------===// 9216// X86 Scheduler Hooks 9217//===----------------------------------------------------------------------===// 9218 9219// private utility function 9220MachineBasicBlock * 9221X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 9222 MachineBasicBlock *MBB, 9223 unsigned regOpc, 9224 unsigned immOpc, 9225 unsigned LoadOpc, 9226 unsigned CXchgOpc, 9227 unsigned notOpc, 9228 unsigned EAXreg, 9229 TargetRegisterClass *RC, 9230 bool invSrc) const { 9231 // For the atomic bitwise operator, we generate 9232 // thisMBB: 9233 // newMBB: 9234 // ld t1 = [bitinstr.addr] 9235 // op t2 = t1, [bitinstr.val] 9236 // mov EAX = t1 9237 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 9238 // bz newMBB 9239 // fallthrough -->nextMBB 9240 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9241 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9242 MachineFunction::iterator MBBIter = MBB; 9243 ++MBBIter; 9244 9245 /// First build the CFG 9246 MachineFunction *F = MBB->getParent(); 9247 MachineBasicBlock *thisMBB = MBB; 9248 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9249 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9250 F->insert(MBBIter, newMBB); 9251 F->insert(MBBIter, nextMBB); 9252 9253 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9254 nextMBB->splice(nextMBB->begin(), thisMBB, 9255 llvm::next(MachineBasicBlock::iterator(bInstr)), 9256 thisMBB->end()); 9257 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9258 9259 // Update thisMBB to fall through to newMBB 9260 thisMBB->addSuccessor(newMBB); 9261 9262 // newMBB jumps to itself and fall through to nextMBB 9263 newMBB->addSuccessor(nextMBB); 9264 newMBB->addSuccessor(newMBB); 9265 9266 // Insert instructions into newMBB based on incoming instruction 9267 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 9268 "unexpected number of operands"); 9269 DebugLoc dl = bInstr->getDebugLoc(); 9270 MachineOperand& destOper = bInstr->getOperand(0); 9271 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9272 int numArgs = bInstr->getNumOperands() - 1; 9273 for (int i=0; i < numArgs; ++i) 9274 argOpers[i] = &bInstr->getOperand(i+1); 9275 9276 // x86 address has 4 operands: base, index, scale, and displacement 9277 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9278 int valArgIndx = lastAddrIndx + 1; 9279 9280 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 9281 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 9282 for (int i=0; i <= lastAddrIndx; ++i) 9283 (*MIB).addOperand(*argOpers[i]); 9284 9285 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 9286 if (invSrc) { 9287 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 9288 } 9289 else 9290 tt = t1; 9291 9292 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 9293 assert((argOpers[valArgIndx]->isReg() || 9294 argOpers[valArgIndx]->isImm()) && 9295 "invalid operand"); 9296 if (argOpers[valArgIndx]->isReg()) 9297 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 9298 else 9299 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 9300 MIB.addReg(tt); 9301 
(*MIB).addOperand(*argOpers[valArgIndx]); 9302 9303 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 9304 MIB.addReg(t1); 9305 9306 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 9307 for (int i=0; i <= lastAddrIndx; ++i) 9308 (*MIB).addOperand(*argOpers[i]); 9309 MIB.addReg(t2); 9310 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9311 (*MIB).setMemRefs(bInstr->memoperands_begin(), 9312 bInstr->memoperands_end()); 9313 9314 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 9315 MIB.addReg(EAXreg); 9316 9317 // insert branch 9318 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9319 9320 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 9321 return nextMBB; 9322} 9323 9324// private utility function: 64 bit atomics on 32 bit host. 9325MachineBasicBlock * 9326X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 9327 MachineBasicBlock *MBB, 9328 unsigned regOpcL, 9329 unsigned regOpcH, 9330 unsigned immOpcL, 9331 unsigned immOpcH, 9332 bool invSrc) const { 9333 // For the atomic bitwise operator, we generate 9334 // thisMBB (instructions are in pairs, except cmpxchg8b) 9335 // ld t1,t2 = [bitinstr.addr] 9336 // newMBB: 9337 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 9338 // op t5, t6 <- out1, out2, [bitinstr.val] 9339 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 9340 // mov ECX, EBX <- t5, t6 9341 // mov EAX, EDX <- t1, t2 9342 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 9343 // mov t3, t4 <- EAX, EDX 9344 // bz newMBB 9345 // result in out1, out2 9346 // fallthrough -->nextMBB 9347 9348 const TargetRegisterClass *RC = X86::GR32RegisterClass; 9349 const unsigned LoadOpc = X86::MOV32rm; 9350 const unsigned NotOpc = X86::NOT32r; 9351 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9352 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9353 MachineFunction::iterator MBBIter = MBB; 9354 ++MBBIter; 9355 9356 /// First build the CFG 9357 MachineFunction *F = MBB->getParent(); 9358 MachineBasicBlock *thisMBB = MBB; 9359 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9360 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9361 F->insert(MBBIter, newMBB); 9362 F->insert(MBBIter, nextMBB); 9363 9364 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9365 nextMBB->splice(nextMBB->begin(), thisMBB, 9366 llvm::next(MachineBasicBlock::iterator(bInstr)), 9367 thisMBB->end()); 9368 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9369 9370 // Update thisMBB to fall through to newMBB 9371 thisMBB->addSuccessor(newMBB); 9372 9373 // newMBB jumps to itself and fall through to nextMBB 9374 newMBB->addSuccessor(nextMBB); 9375 newMBB->addSuccessor(newMBB); 9376 9377 DebugLoc dl = bInstr->getDebugLoc(); 9378 // Insert instructions into newMBB based on incoming instruction 9379 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 9380 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 9381 "unexpected number of operands"); 9382 MachineOperand& dest1Oper = bInstr->getOperand(0); 9383 MachineOperand& dest2Oper = bInstr->getOperand(1); 9384 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9385 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 9386 argOpers[i] = &bInstr->getOperand(i+2); 9387 9388 // We use some of the operands multiple times, so conservatively just 9389 // clear any kill flags that might be present. 
9390 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 9391 argOpers[i]->setIsKill(false); 9392 } 9393 9394 // x86 address has 5 operands: base, index, scale, displacement, and segment. 9395 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9396 9397 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 9398 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 9399 for (int i=0; i <= lastAddrIndx; ++i) 9400 (*MIB).addOperand(*argOpers[i]); 9401 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 9402 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 9403 // add 4 to displacement. 9404 for (int i=0; i <= lastAddrIndx-2; ++i) 9405 (*MIB).addOperand(*argOpers[i]); 9406 MachineOperand newOp3 = *(argOpers[3]); 9407 if (newOp3.isImm()) 9408 newOp3.setImm(newOp3.getImm()+4); 9409 else 9410 newOp3.setOffset(newOp3.getOffset()+4); 9411 (*MIB).addOperand(newOp3); 9412 (*MIB).addOperand(*argOpers[lastAddrIndx]); 9413 9414 // t3/4 are defined later, at the bottom of the loop 9415 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 9416 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 9417 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 9418 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 9419 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 9420 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 9421 9422 // The subsequent operations should be using the destination registers of 9423 //the PHI instructions. 9424 if (invSrc) { 9425 t1 = F->getRegInfo().createVirtualRegister(RC); 9426 t2 = F->getRegInfo().createVirtualRegister(RC); 9427 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 9428 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 9429 } else { 9430 t1 = dest1Oper.getReg(); 9431 t2 = dest2Oper.getReg(); 9432 } 9433 9434 int valArgIndx = lastAddrIndx + 1; 9435 assert((argOpers[valArgIndx]->isReg() || 9436 argOpers[valArgIndx]->isImm()) && 9437 "invalid operand"); 9438 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 9439 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 9440 if (argOpers[valArgIndx]->isReg()) 9441 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 9442 else 9443 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 9444 if (regOpcL != X86::MOV32rr) 9445 MIB.addReg(t1); 9446 (*MIB).addOperand(*argOpers[valArgIndx]); 9447 assert(argOpers[valArgIndx + 1]->isReg() == 9448 argOpers[valArgIndx]->isReg()); 9449 assert(argOpers[valArgIndx + 1]->isImm() == 9450 argOpers[valArgIndx]->isImm()); 9451 if (argOpers[valArgIndx + 1]->isReg()) 9452 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 9453 else 9454 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 9455 if (regOpcH != X86::MOV32rr) 9456 MIB.addReg(t2); 9457 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 9458 9459 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 9460 MIB.addReg(t1); 9461 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 9462 MIB.addReg(t2); 9463 9464 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 9465 MIB.addReg(t5); 9466 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 9467 MIB.addReg(t6); 9468 9469 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 9470 for (int i=0; i <= lastAddrIndx; ++i) 9471 (*MIB).addOperand(*argOpers[i]); 9472 9473 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9474 (*MIB).setMemRefs(bInstr->memoperands_begin(), 9475 bInstr->memoperands_end()); 9476 9477 
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 9478 MIB.addReg(X86::EAX); 9479 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 9480 MIB.addReg(X86::EDX); 9481 9482 // insert branch 9483 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9484 9485 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 9486 return nextMBB; 9487} 9488 9489// private utility function 9490MachineBasicBlock * 9491X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 9492 MachineBasicBlock *MBB, 9493 unsigned cmovOpc) const { 9494 // For the atomic min/max operator, we generate 9495 // thisMBB: 9496 // newMBB: 9497 // ld t1 = [min/max.addr] 9498 // mov t2 = [min/max.val] 9499 // cmp t1, t2 9500 // cmov[cond] t2 = t1 9501 // mov EAX = t1 9502 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 9503 // bz newMBB 9504 // fallthrough -->nextMBB 9505 // 9506 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9507 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9508 MachineFunction::iterator MBBIter = MBB; 9509 ++MBBIter; 9510 9511 /// First build the CFG 9512 MachineFunction *F = MBB->getParent(); 9513 MachineBasicBlock *thisMBB = MBB; 9514 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9515 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9516 F->insert(MBBIter, newMBB); 9517 F->insert(MBBIter, nextMBB); 9518 9519 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9520 nextMBB->splice(nextMBB->begin(), thisMBB, 9521 llvm::next(MachineBasicBlock::iterator(mInstr)), 9522 thisMBB->end()); 9523 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9524 9525 // Update thisMBB to fall through to newMBB 9526 thisMBB->addSuccessor(newMBB); 9527 9528 // newMBB jumps to newMBB and fall through to nextMBB 9529 newMBB->addSuccessor(nextMBB); 9530 newMBB->addSuccessor(newMBB); 9531 9532 DebugLoc dl = mInstr->getDebugLoc(); 9533 // Insert instructions into newMBB based on incoming instruction 9534 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 9535 "unexpected number of operands"); 9536 MachineOperand& destOper = mInstr->getOperand(0); 9537 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9538 int numArgs = mInstr->getNumOperands() - 1; 9539 for (int i=0; i < numArgs; ++i) 9540 argOpers[i] = &mInstr->getOperand(i+1); 9541 9542 // x86 address has 4 operands: base, index, scale, and displacement 9543 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9544 int valArgIndx = lastAddrIndx + 1; 9545 9546 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9547 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 9548 for (int i=0; i <= lastAddrIndx; ++i) 9549 (*MIB).addOperand(*argOpers[i]); 9550 9551 // We only support register and immediate values 9552 assert((argOpers[valArgIndx]->isReg() || 9553 argOpers[valArgIndx]->isImm()) && 9554 "invalid operand"); 9555 9556 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9557 if (argOpers[valArgIndx]->isReg()) 9558 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 9559 else 9560 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 9561 (*MIB).addOperand(*argOpers[valArgIndx]); 9562 9563 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 9564 MIB.addReg(t1); 9565 9566 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 9567 MIB.addReg(t1); 9568 MIB.addReg(t2); 9569 9570 // Generate movc 9571 unsigned t3 = 
F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9572 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 9573 MIB.addReg(t2); 9574 MIB.addReg(t1); 9575 9576 // Cmp and exchange if none has modified the memory location 9577 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 9578 for (int i=0; i <= lastAddrIndx; ++i) 9579 (*MIB).addOperand(*argOpers[i]); 9580 MIB.addReg(t3); 9581 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9582 (*MIB).setMemRefs(mInstr->memoperands_begin(), 9583 mInstr->memoperands_end()); 9584 9585 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 9586 MIB.addReg(X86::EAX); 9587 9588 // insert branch 9589 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9590 9591 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 9592 return nextMBB; 9593} 9594 9595// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 9596// or XMM0_V32I8 in AVX all of this code can be replaced with that 9597// in the .td file. 9598MachineBasicBlock * 9599X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 9600 unsigned numArgs, bool memArg) const { 9601 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 9602 "Target must have SSE4.2 or AVX features enabled"); 9603 9604 DebugLoc dl = MI->getDebugLoc(); 9605 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9606 unsigned Opc; 9607 if (!Subtarget->hasAVX()) { 9608 if (memArg) 9609 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 9610 else 9611 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 9612 } else { 9613 if (memArg) 9614 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 9615 else 9616 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 9617 } 9618 9619 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 9620 for (unsigned i = 0; i < numArgs; ++i) { 9621 MachineOperand &Op = MI->getOperand(i+1); 9622 if (!(Op.isReg() && Op.isImplicit())) 9623 MIB.addOperand(Op); 9624 } 9625 BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 9626 .addReg(X86::XMM0); 9627 9628 MI->eraseFromParent(); 9629 return BB; 9630} 9631 9632MachineBasicBlock * 9633X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { 9634 DebugLoc dl = MI->getDebugLoc(); 9635 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9636 9637 // Address into RAX/EAX, other two args into ECX, EDX. 9638 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 9639 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 9640 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 9641 for (int i = 0; i < X86::AddrNumOperands; ++i) 9642 MIB.addOperand(MI->getOperand(i)); 9643 9644 unsigned ValOps = X86::AddrNumOperands; 9645 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 9646 .addReg(MI->getOperand(ValOps).getReg()); 9647 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 9648 .addReg(MI->getOperand(ValOps+1).getReg()); 9649 9650 // The instruction doesn't actually take any operands though. 9651 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 9652 9653 MI->eraseFromParent(); // The pseudo is gone now. 
9654 return BB; 9655} 9656 9657MachineBasicBlock * 9658X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { 9659 DebugLoc dl = MI->getDebugLoc(); 9660 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9661 9662 // First arg in ECX, the second in EAX. 9663 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 9664 .addReg(MI->getOperand(0).getReg()); 9665 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) 9666 .addReg(MI->getOperand(1).getReg()); 9667 9668 // The instruction doesn't actually take any operands though. 9669 BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); 9670 9671 MI->eraseFromParent(); // The pseudo is gone now. 9672 return BB; 9673} 9674 9675MachineBasicBlock * 9676X86TargetLowering::EmitVAARG64WithCustomInserter( 9677 MachineInstr *MI, 9678 MachineBasicBlock *MBB) const { 9679 // Emit va_arg instruction on X86-64. 9680 9681 // Operands to this pseudo-instruction: 9682 // 0 ) Output : destination address (reg) 9683 // 1-5) Input : va_list address (addr, i64mem) 9684 // 6 ) ArgSize : Size (in bytes) of vararg type 9685 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 9686 // 8 ) Align : Alignment of type 9687 // 9 ) EFLAGS (implicit-def) 9688 9689 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 9690 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 9691 9692 unsigned DestReg = MI->getOperand(0).getReg(); 9693 MachineOperand &Base = MI->getOperand(1); 9694 MachineOperand &Scale = MI->getOperand(2); 9695 MachineOperand &Index = MI->getOperand(3); 9696 MachineOperand &Disp = MI->getOperand(4); 9697 MachineOperand &Segment = MI->getOperand(5); 9698 unsigned ArgSize = MI->getOperand(6).getImm(); 9699 unsigned ArgMode = MI->getOperand(7).getImm(); 9700 unsigned Align = MI->getOperand(8).getImm(); 9701 9702 // Memory Reference 9703 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 9704 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 9705 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 9706 9707 // Machine Information 9708 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9709 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 9710 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 9711 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 9712 DebugLoc DL = MI->getDebugLoc(); 9713 9714 // struct va_list { 9715 // i32 gp_offset 9716 // i32 fp_offset 9717 // i64 overflow_area (address) 9718 // i64 reg_save_area (address) 9719 // } 9720 // sizeof(va_list) = 24 9721 // alignment(va_list) = 8 9722 9723 unsigned TotalNumIntRegs = 6; 9724 unsigned TotalNumXMMRegs = 8; 9725 bool UseGPOffset = (ArgMode == 1); 9726 bool UseFPOffset = (ArgMode == 2); 9727 unsigned MaxOffset = TotalNumIntRegs * 8 + 9728 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 9729 9730 /* Align ArgSize to a multiple of 8 */ 9731 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 9732 bool NeedsAlign = (Align > 8); 9733 9734 MachineBasicBlock *thisMBB = MBB; 9735 MachineBasicBlock *overflowMBB; 9736 MachineBasicBlock *offsetMBB; 9737 MachineBasicBlock *endMBB; 9738 9739 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 9740 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 9741 unsigned OffsetReg = 0; 9742 9743 if (!UseGPOffset && !UseFPOffset) { 9744 // If we only pull from the overflow region, we don't create a branch. 
9745 // We don't need to alter control flow. 9746 OffsetDestReg = 0; // unused 9747 OverflowDestReg = DestReg; 9748 9749 offsetMBB = NULL; 9750 overflowMBB = thisMBB; 9751 endMBB = thisMBB; 9752 } else { 9753 // First emit code to check if gp_offset (or fp_offset) is below the bound. 9754 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 9755 // If not, pull from overflow_area. (branch to overflowMBB) 9756 // 9757 // thisMBB 9758 // | . 9759 // | . 9760 // offsetMBB overflowMBB 9761 // | . 9762 // | . 9763 // endMBB 9764 9765 // Registers for the PHI in endMBB 9766 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 9767 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 9768 9769 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9770 MachineFunction *MF = MBB->getParent(); 9771 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9772 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9773 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9774 9775 MachineFunction::iterator MBBIter = MBB; 9776 ++MBBIter; 9777 9778 // Insert the new basic blocks 9779 MF->insert(MBBIter, offsetMBB); 9780 MF->insert(MBBIter, overflowMBB); 9781 MF->insert(MBBIter, endMBB); 9782 9783 // Transfer the remainder of MBB and its successor edges to endMBB. 9784 endMBB->splice(endMBB->begin(), thisMBB, 9785 llvm::next(MachineBasicBlock::iterator(MI)), 9786 thisMBB->end()); 9787 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9788 9789 // Make offsetMBB and overflowMBB successors of thisMBB 9790 thisMBB->addSuccessor(offsetMBB); 9791 thisMBB->addSuccessor(overflowMBB); 9792 9793 // endMBB is a successor of both offsetMBB and overflowMBB 9794 offsetMBB->addSuccessor(endMBB); 9795 overflowMBB->addSuccessor(endMBB); 9796 9797 // Load the offset value into a register 9798 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 9799 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 9800 .addOperand(Base) 9801 .addOperand(Scale) 9802 .addOperand(Index) 9803 .addDisp(Disp, UseFPOffset ? 4 : 0) 9804 .addOperand(Segment) 9805 .setMemRefs(MMOBegin, MMOEnd); 9806 9807 // Check if there is enough room left to pull this argument. 9808 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 9809 .addReg(OffsetReg) 9810 .addImm(MaxOffset + 8 - ArgSizeA8); 9811 9812 // Branch to "overflowMBB" if offset >= max 9813 // Fall through to "offsetMBB" otherwise 9814 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 9815 .addMBB(overflowMBB); 9816 } 9817 9818 // In offsetMBB, emit code to use the reg_save_area. 9819 if (offsetMBB) { 9820 assert(OffsetReg != 0); 9821 9822 // Read the reg_save_area address. 9823 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 9824 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 9825 .addOperand(Base) 9826 .addOperand(Scale) 9827 .addOperand(Index) 9828 .addDisp(Disp, 16) 9829 .addOperand(Segment) 9830 .setMemRefs(MMOBegin, MMOEnd); 9831 9832 // Zero-extend the offset 9833 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 9834 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 9835 .addImm(0) 9836 .addReg(OffsetReg) 9837 .addImm(X86::sub_32bit); 9838 9839 // Add the offset to the reg_save_area to get the final address. 
9840 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 9841 .addReg(OffsetReg64) 9842 .addReg(RegSaveReg); 9843 9844 // Compute the offset for the next argument 9845 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 9846 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 9847 .addReg(OffsetReg) 9848 .addImm(UseFPOffset ? 16 : 8); 9849 9850 // Store it back into the va_list. 9851 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 9852 .addOperand(Base) 9853 .addOperand(Scale) 9854 .addOperand(Index) 9855 .addDisp(Disp, UseFPOffset ? 4 : 0) 9856 .addOperand(Segment) 9857 .addReg(NextOffsetReg) 9858 .setMemRefs(MMOBegin, MMOEnd); 9859 9860 // Jump to endMBB 9861 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 9862 .addMBB(endMBB); 9863 } 9864 9865 // 9866 // Emit code to use overflow area 9867 // 9868 9869 // Load the overflow_area address into a register. 9870 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 9871 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 9872 .addOperand(Base) 9873 .addOperand(Scale) 9874 .addOperand(Index) 9875 .addDisp(Disp, 8) 9876 .addOperand(Segment) 9877 .setMemRefs(MMOBegin, MMOEnd); 9878 9879 // If we need to align it, do so. Otherwise, just copy the address 9880 // to OverflowDestReg. 9881 if (NeedsAlign) { 9882 // Align the overflow address 9883 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 9884 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 9885 9886 // aligned_addr = (addr + (align-1)) & ~(align-1) 9887 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 9888 .addReg(OverflowAddrReg) 9889 .addImm(Align-1); 9890 9891 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 9892 .addReg(TmpReg) 9893 .addImm(~(uint64_t)(Align-1)); 9894 } else { 9895 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 9896 .addReg(OverflowAddrReg); 9897 } 9898 9899 // Compute the next overflow address after this argument. 9900 // (the overflow address should be kept 8-byte aligned) 9901 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 9902 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 9903 .addReg(OverflowDestReg) 9904 .addImm(ArgSizeA8); 9905 9906 // Store the new overflow address. 9907 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 9908 .addOperand(Base) 9909 .addOperand(Scale) 9910 .addOperand(Index) 9911 .addDisp(Disp, 8) 9912 .addOperand(Segment) 9913 .addReg(NextAddrReg) 9914 .setMemRefs(MMOBegin, MMOEnd); 9915 9916 // If we branched, emit the PHI to the front of endMBB. 9917 if (offsetMBB) { 9918 BuildMI(*endMBB, endMBB->begin(), DL, 9919 TII->get(X86::PHI), DestReg) 9920 .addReg(OffsetDestReg).addMBB(offsetMBB) 9921 .addReg(OverflowDestReg).addMBB(overflowMBB); 9922 } 9923 9924 // Erase the pseudo instruction 9925 MI->eraseFromParent(); 9926 9927 return endMBB; 9928} 9929 9930MachineBasicBlock * 9931X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 9932 MachineInstr *MI, 9933 MachineBasicBlock *MBB) const { 9934 // Emit code to save XMM registers to the stack. The ABI says that the 9935 // number of registers to save is given in %al, so it's theoretically 9936 // possible to do an indirect jump trick to avoid saving all of them, 9937 // however this code takes a simpler approach and just executes all 9938 // of the stores if %al is non-zero. It's less code, and it's probably 9939 // easier on the hardware branch predictor, and stores aren't all that 9940 // expensive anyway. 
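  //
  // A rough sketch of the code this produces on a SysV x86-64 target
  // (register names, labels and offsets are illustrative only):
  //
  //     testb %al, %al
  //     je    .LEndMBB                # no XMM argument registers were used
  //   .LXMMSaveMBB:
  //     movaps %xmm0, <reg_save_area + VarArgsFPOffset +  0>
  //     movaps %xmm1, <reg_save_area + VarArgsFPOffset + 16>
  //     ...                           # one 16-byte store per XMM arg register
  //   .LEndMBB: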
9941 9942 // Create the new basic blocks. One block contains all the XMM stores, 9943 // and one block is the final destination regardless of whether any 9944 // stores were performed. 9945 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9946 MachineFunction *F = MBB->getParent(); 9947 MachineFunction::iterator MBBIter = MBB; 9948 ++MBBIter; 9949 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 9950 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 9951 F->insert(MBBIter, XMMSaveMBB); 9952 F->insert(MBBIter, EndMBB); 9953 9954 // Transfer the remainder of MBB and its successor edges to EndMBB. 9955 EndMBB->splice(EndMBB->begin(), MBB, 9956 llvm::next(MachineBasicBlock::iterator(MI)), 9957 MBB->end()); 9958 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 9959 9960 // The original block will now fall through to the XMM save block. 9961 MBB->addSuccessor(XMMSaveMBB); 9962 // The XMMSaveMBB will fall through to the end block. 9963 XMMSaveMBB->addSuccessor(EndMBB); 9964 9965 // Now add the instructions. 9966 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9967 DebugLoc DL = MI->getDebugLoc(); 9968 9969 unsigned CountReg = MI->getOperand(0).getReg(); 9970 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 9971 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 9972 9973 if (!Subtarget->isTargetWin64()) { 9974 // If %al is 0, branch around the XMM save block. 9975 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 9976 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 9977 MBB->addSuccessor(EndMBB); 9978 } 9979 9980 // In the XMM save block, save all the XMM argument registers. 9981 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 9982 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 9983 MachineMemOperand *MMO = 9984 F->getMachineMemOperand( 9985 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 9986 MachineMemOperand::MOStore, 9987 /*Size=*/16, /*Align=*/16); 9988 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 9989 .addFrameIndex(RegSaveFrameIndex) 9990 .addImm(/*Scale=*/1) 9991 .addReg(/*IndexReg=*/0) 9992 .addImm(/*Disp=*/Offset) 9993 .addReg(/*Segment=*/0) 9994 .addReg(MI->getOperand(i).getReg()) 9995 .addMemOperand(MMO); 9996 } 9997 9998 MI->eraseFromParent(); // The pseudo instruction is gone now. 9999 10000 return EndMBB; 10001} 10002 10003MachineBasicBlock * 10004X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 10005 MachineBasicBlock *BB) const { 10006 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10007 DebugLoc DL = MI->getDebugLoc(); 10008 10009 // To "insert" a SELECT_CC instruction, we actually have to insert the 10010 // diamond control-flow pattern. The incoming instruction knows the 10011 // destination vreg to set, the condition code register to branch on, the 10012 // true/false values to select between, and a branch opcode to use. 10013 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10014 MachineFunction::iterator It = BB; 10015 ++It; 10016 10017 // thisMBB: 10018 // ... 10019 // TrueVal = ... 
10020 // cmpTY ccX, r1, r2 10021 // bCC copy1MBB 10022 // fallthrough --> copy0MBB 10023 MachineBasicBlock *thisMBB = BB; 10024 MachineFunction *F = BB->getParent(); 10025 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10026 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10027 F->insert(It, copy0MBB); 10028 F->insert(It, sinkMBB); 10029 10030 // If the EFLAGS register isn't dead in the terminator, then claim that it's 10031 // live into the sink and copy blocks. 10032 const MachineFunction *MF = BB->getParent(); 10033 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 10034 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 10035 10036 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 10037 const MachineOperand &MO = MI->getOperand(I); 10038 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 10039 unsigned Reg = MO.getReg(); 10040 if (Reg != X86::EFLAGS) continue; 10041 copy0MBB->addLiveIn(Reg); 10042 sinkMBB->addLiveIn(Reg); 10043 } 10044 10045 // Transfer the remainder of BB and its successor edges to sinkMBB. 10046 sinkMBB->splice(sinkMBB->begin(), BB, 10047 llvm::next(MachineBasicBlock::iterator(MI)), 10048 BB->end()); 10049 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10050 10051 // Add the true and fallthrough blocks as its successors. 10052 BB->addSuccessor(copy0MBB); 10053 BB->addSuccessor(sinkMBB); 10054 10055 // Create the conditional branch instruction. 10056 unsigned Opc = 10057 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 10058 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 10059 10060 // copy0MBB: 10061 // %FalseValue = ... 10062 // # fallthrough to sinkMBB 10063 copy0MBB->addSuccessor(sinkMBB); 10064 10065 // sinkMBB: 10066 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 10067 // ... 10068 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 10069 TII->get(X86::PHI), MI->getOperand(0).getReg()) 10070 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 10071 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 10072 10073 MI->eraseFromParent(); // The pseudo instruction is gone now. 10074 return sinkMBB; 10075} 10076 10077MachineBasicBlock * 10078X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 10079 MachineBasicBlock *BB) const { 10080 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10081 DebugLoc DL = MI->getDebugLoc(); 10082 10083 // The lowering is pretty easy: we're just emitting the call to _alloca. The 10084 // non-trivial part is impdef of ESP. 10085 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 10086 // mingw-w64. 10087 10088 const char *StackProbeSymbol = 10089 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 10090 10091 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 10092 .addExternalSymbol(StackProbeSymbol) 10093 .addReg(X86::EAX, RegState::Implicit) 10094 .addReg(X86::ESP, RegState::Implicit) 10095 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 10096 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 10097 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 10098 10099 MI->eraseFromParent(); // The pseudo instruction is gone now. 10100 return BB; 10101} 10102 10103MachineBasicBlock * 10104X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 10105 MachineBasicBlock *BB) const { 10106 // This is pretty easy. We're taking the value that we received from 10107 // our load from the relocation, sticking it in either RDI (x86-64) 10108 // or EAX and doing an indirect call. 
The return value will then 10109 // be in the normal return register. 10110 const X86InstrInfo *TII 10111 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 10112 DebugLoc DL = MI->getDebugLoc(); 10113 MachineFunction *F = BB->getParent(); 10114 10115 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 10116 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 10117 10118 if (Subtarget->is64Bit()) { 10119 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10120 TII->get(X86::MOV64rm), X86::RDI) 10121 .addReg(X86::RIP) 10122 .addImm(0).addReg(0) 10123 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10124 MI->getOperand(3).getTargetFlags()) 10125 .addReg(0); 10126 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 10127 addDirectMem(MIB, X86::RDI); 10128 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 10129 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10130 TII->get(X86::MOV32rm), X86::EAX) 10131 .addReg(0) 10132 .addImm(0).addReg(0) 10133 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10134 MI->getOperand(3).getTargetFlags()) 10135 .addReg(0); 10136 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 10137 addDirectMem(MIB, X86::EAX); 10138 } else { 10139 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10140 TII->get(X86::MOV32rm), X86::EAX) 10141 .addReg(TII->getGlobalBaseReg(F)) 10142 .addImm(0).addReg(0) 10143 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10144 MI->getOperand(3).getTargetFlags()) 10145 .addReg(0); 10146 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 10147 addDirectMem(MIB, X86::EAX); 10148 } 10149 10150 MI->eraseFromParent(); // The pseudo instruction is gone now. 10151 return BB; 10152} 10153 10154MachineBasicBlock * 10155X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 10156 MachineBasicBlock *BB) const { 10157 switch (MI->getOpcode()) { 10158 default: assert(false && "Unexpected instr type to insert"); 10159 case X86::TAILJMPd64: 10160 case X86::TAILJMPr64: 10161 case X86::TAILJMPm64: 10162 assert(!"TAILJMP64 would not be touched here."); 10163 case X86::TCRETURNdi64: 10164 case X86::TCRETURNri64: 10165 case X86::TCRETURNmi64: 10166 // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset. 10167 // On AMD64, additional defs should be added before register allocation. 
10168 if (!Subtarget->isTargetWin64()) { 10169 MI->addRegisterDefined(X86::RSI); 10170 MI->addRegisterDefined(X86::RDI); 10171 MI->addRegisterDefined(X86::XMM6); 10172 MI->addRegisterDefined(X86::XMM7); 10173 MI->addRegisterDefined(X86::XMM8); 10174 MI->addRegisterDefined(X86::XMM9); 10175 MI->addRegisterDefined(X86::XMM10); 10176 MI->addRegisterDefined(X86::XMM11); 10177 MI->addRegisterDefined(X86::XMM12); 10178 MI->addRegisterDefined(X86::XMM13); 10179 MI->addRegisterDefined(X86::XMM14); 10180 MI->addRegisterDefined(X86::XMM15); 10181 } 10182 return BB; 10183 case X86::WIN_ALLOCA: 10184 return EmitLoweredWinAlloca(MI, BB); 10185 case X86::TLSCall_32: 10186 case X86::TLSCall_64: 10187 return EmitLoweredTLSCall(MI, BB); 10188 case X86::CMOV_GR8: 10189 case X86::CMOV_FR32: 10190 case X86::CMOV_FR64: 10191 case X86::CMOV_V4F32: 10192 case X86::CMOV_V2F64: 10193 case X86::CMOV_V2I64: 10194 case X86::CMOV_GR16: 10195 case X86::CMOV_GR32: 10196 case X86::CMOV_RFP32: 10197 case X86::CMOV_RFP64: 10198 case X86::CMOV_RFP80: 10199 return EmitLoweredSelect(MI, BB); 10200 10201 case X86::FP32_TO_INT16_IN_MEM: 10202 case X86::FP32_TO_INT32_IN_MEM: 10203 case X86::FP32_TO_INT64_IN_MEM: 10204 case X86::FP64_TO_INT16_IN_MEM: 10205 case X86::FP64_TO_INT32_IN_MEM: 10206 case X86::FP64_TO_INT64_IN_MEM: 10207 case X86::FP80_TO_INT16_IN_MEM: 10208 case X86::FP80_TO_INT32_IN_MEM: 10209 case X86::FP80_TO_INT64_IN_MEM: { 10210 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10211 DebugLoc DL = MI->getDebugLoc(); 10212 10213 // Change the floating point control register to use "round towards zero" 10214 // mode when truncating to an integer value. 10215 MachineFunction *F = BB->getParent(); 10216 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 10217 addFrameReference(BuildMI(*BB, MI, DL, 10218 TII->get(X86::FNSTCW16m)), CWFrameIdx); 10219 10220 // Load the old value of the high byte of the control word... 10221 unsigned OldCW = 10222 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 10223 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 10224 CWFrameIdx); 10225 10226 // Set the high part to be round to zero... 10227 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 10228 .addImm(0xC7F); 10229 10230 // Reload the modified control word now... 10231 addFrameReference(BuildMI(*BB, MI, DL, 10232 TII->get(X86::FLDCW16m)), CWFrameIdx); 10233 10234 // Restore the memory image of control word to original value 10235 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 10236 .addReg(OldCW); 10237 10238 // Get the X86 opcode to use. 
10239 unsigned Opc; 10240 switch (MI->getOpcode()) { 10241 default: llvm_unreachable("illegal opcode!"); 10242 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 10243 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 10244 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 10245 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 10246 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 10247 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 10248 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 10249 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 10250 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 10251 } 10252 10253 X86AddressMode AM; 10254 MachineOperand &Op = MI->getOperand(0); 10255 if (Op.isReg()) { 10256 AM.BaseType = X86AddressMode::RegBase; 10257 AM.Base.Reg = Op.getReg(); 10258 } else { 10259 AM.BaseType = X86AddressMode::FrameIndexBase; 10260 AM.Base.FrameIndex = Op.getIndex(); 10261 } 10262 Op = MI->getOperand(1); 10263 if (Op.isImm()) 10264 AM.Scale = Op.getImm(); 10265 Op = MI->getOperand(2); 10266 if (Op.isImm()) 10267 AM.IndexReg = Op.getImm(); 10268 Op = MI->getOperand(3); 10269 if (Op.isGlobal()) { 10270 AM.GV = Op.getGlobal(); 10271 } else { 10272 AM.Disp = Op.getImm(); 10273 } 10274 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 10275 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 10276 10277 // Reload the original control word now. 10278 addFrameReference(BuildMI(*BB, MI, DL, 10279 TII->get(X86::FLDCW16m)), CWFrameIdx); 10280 10281 MI->eraseFromParent(); // The pseudo instruction is gone now. 10282 return BB; 10283 } 10284 // String/text processing lowering. 10285 case X86::PCMPISTRM128REG: 10286 case X86::VPCMPISTRM128REG: 10287 return EmitPCMP(MI, BB, 3, false /* in-mem */); 10288 case X86::PCMPISTRM128MEM: 10289 case X86::VPCMPISTRM128MEM: 10290 return EmitPCMP(MI, BB, 3, true /* in-mem */); 10291 case X86::PCMPESTRM128REG: 10292 case X86::VPCMPESTRM128REG: 10293 return EmitPCMP(MI, BB, 5, false /* in mem */); 10294 case X86::PCMPESTRM128MEM: 10295 case X86::VPCMPESTRM128MEM: 10296 return EmitPCMP(MI, BB, 5, true /* in mem */); 10297 10298 // Thread synchronization. 10299 case X86::MONITOR: 10300 return EmitMonitor(MI, BB); 10301 case X86::MWAIT: 10302 return EmitMwait(MI, BB); 10303 10304 // Atomic Lowering. 
10305 case X86::ATOMAND32: 10306 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10307 X86::AND32ri, X86::MOV32rm, 10308 X86::LCMPXCHG32, 10309 X86::NOT32r, X86::EAX, 10310 X86::GR32RegisterClass); 10311 case X86::ATOMOR32: 10312 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 10313 X86::OR32ri, X86::MOV32rm, 10314 X86::LCMPXCHG32, 10315 X86::NOT32r, X86::EAX, 10316 X86::GR32RegisterClass); 10317 case X86::ATOMXOR32: 10318 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 10319 X86::XOR32ri, X86::MOV32rm, 10320 X86::LCMPXCHG32, 10321 X86::NOT32r, X86::EAX, 10322 X86::GR32RegisterClass); 10323 case X86::ATOMNAND32: 10324 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10325 X86::AND32ri, X86::MOV32rm, 10326 X86::LCMPXCHG32, 10327 X86::NOT32r, X86::EAX, 10328 X86::GR32RegisterClass, true); 10329 case X86::ATOMMIN32: 10330 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 10331 case X86::ATOMMAX32: 10332 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 10333 case X86::ATOMUMIN32: 10334 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 10335 case X86::ATOMUMAX32: 10336 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 10337 10338 case X86::ATOMAND16: 10339 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10340 X86::AND16ri, X86::MOV16rm, 10341 X86::LCMPXCHG16, 10342 X86::NOT16r, X86::AX, 10343 X86::GR16RegisterClass); 10344 case X86::ATOMOR16: 10345 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 10346 X86::OR16ri, X86::MOV16rm, 10347 X86::LCMPXCHG16, 10348 X86::NOT16r, X86::AX, 10349 X86::GR16RegisterClass); 10350 case X86::ATOMXOR16: 10351 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 10352 X86::XOR16ri, X86::MOV16rm, 10353 X86::LCMPXCHG16, 10354 X86::NOT16r, X86::AX, 10355 X86::GR16RegisterClass); 10356 case X86::ATOMNAND16: 10357 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10358 X86::AND16ri, X86::MOV16rm, 10359 X86::LCMPXCHG16, 10360 X86::NOT16r, X86::AX, 10361 X86::GR16RegisterClass, true); 10362 case X86::ATOMMIN16: 10363 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 10364 case X86::ATOMMAX16: 10365 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 10366 case X86::ATOMUMIN16: 10367 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 10368 case X86::ATOMUMAX16: 10369 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 10370 10371 case X86::ATOMAND8: 10372 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10373 X86::AND8ri, X86::MOV8rm, 10374 X86::LCMPXCHG8, 10375 X86::NOT8r, X86::AL, 10376 X86::GR8RegisterClass); 10377 case X86::ATOMOR8: 10378 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 10379 X86::OR8ri, X86::MOV8rm, 10380 X86::LCMPXCHG8, 10381 X86::NOT8r, X86::AL, 10382 X86::GR8RegisterClass); 10383 case X86::ATOMXOR8: 10384 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 10385 X86::XOR8ri, X86::MOV8rm, 10386 X86::LCMPXCHG8, 10387 X86::NOT8r, X86::AL, 10388 X86::GR8RegisterClass); 10389 case X86::ATOMNAND8: 10390 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10391 X86::AND8ri, X86::MOV8rm, 10392 X86::LCMPXCHG8, 10393 X86::NOT8r, X86::AL, 10394 X86::GR8RegisterClass, true); 10395 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 10396 // This group is for 64-bit host. 
10397 case X86::ATOMAND64: 10398 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10399 X86::AND64ri32, X86::MOV64rm, 10400 X86::LCMPXCHG64, 10401 X86::NOT64r, X86::RAX, 10402 X86::GR64RegisterClass); 10403 case X86::ATOMOR64: 10404 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 10405 X86::OR64ri32, X86::MOV64rm, 10406 X86::LCMPXCHG64, 10407 X86::NOT64r, X86::RAX, 10408 X86::GR64RegisterClass); 10409 case X86::ATOMXOR64: 10410 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 10411 X86::XOR64ri32, X86::MOV64rm, 10412 X86::LCMPXCHG64, 10413 X86::NOT64r, X86::RAX, 10414 X86::GR64RegisterClass); 10415 case X86::ATOMNAND64: 10416 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10417 X86::AND64ri32, X86::MOV64rm, 10418 X86::LCMPXCHG64, 10419 X86::NOT64r, X86::RAX, 10420 X86::GR64RegisterClass, true); 10421 case X86::ATOMMIN64: 10422 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 10423 case X86::ATOMMAX64: 10424 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 10425 case X86::ATOMUMIN64: 10426 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 10427 case X86::ATOMUMAX64: 10428 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 10429 10430 // This group does 64-bit operations on a 32-bit host. 10431 case X86::ATOMAND6432: 10432 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10433 X86::AND32rr, X86::AND32rr, 10434 X86::AND32ri, X86::AND32ri, 10435 false); 10436 case X86::ATOMOR6432: 10437 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10438 X86::OR32rr, X86::OR32rr, 10439 X86::OR32ri, X86::OR32ri, 10440 false); 10441 case X86::ATOMXOR6432: 10442 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10443 X86::XOR32rr, X86::XOR32rr, 10444 X86::XOR32ri, X86::XOR32ri, 10445 false); 10446 case X86::ATOMNAND6432: 10447 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10448 X86::AND32rr, X86::AND32rr, 10449 X86::AND32ri, X86::AND32ri, 10450 true); 10451 case X86::ATOMADD6432: 10452 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10453 X86::ADD32rr, X86::ADC32rr, 10454 X86::ADD32ri, X86::ADC32ri, 10455 false); 10456 case X86::ATOMSUB6432: 10457 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10458 X86::SUB32rr, X86::SBB32rr, 10459 X86::SUB32ri, X86::SBB32ri, 10460 false); 10461 case X86::ATOMSWAP6432: 10462 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10463 X86::MOV32rr, X86::MOV32rr, 10464 X86::MOV32ri, X86::MOV32ri, 10465 false); 10466 case X86::VASTART_SAVE_XMM_REGS: 10467 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 10468 10469 case X86::VAARG_64: 10470 return EmitVAARG64WithCustomInserter(MI, BB); 10471 } 10472} 10473 10474//===----------------------------------------------------------------------===// 10475// X86 Optimization Hooks 10476//===----------------------------------------------------------------------===// 10477 10478void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 10479 const APInt &Mask, 10480 APInt &KnownZero, 10481 APInt &KnownOne, 10482 const SelectionDAG &DAG, 10483 unsigned Depth) const { 10484 unsigned Opc = Op.getOpcode(); 10485 assert((Opc >= ISD::BUILTIN_OP_END || 10486 Opc == ISD::INTRINSIC_WO_CHAIN || 10487 Opc == ISD::INTRINSIC_W_CHAIN || 10488 Opc == ISD::INTRINSIC_VOID) && 10489 "Should use MaskedValueIsZero if you don't know whether Op" 10490 " is a target node!"); 10491 10492 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
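  // For example, X86ISD::SETCC materializes only 0 or 1, so in an i8 result
  // bits 7..1 are known zero; the arithmetic nodes below have the same
  // property, but only on their second (flag/boolean) result.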
10493 switch (Opc) { 10494 default: break; 10495 case X86ISD::ADD: 10496 case X86ISD::SUB: 10497 case X86ISD::ADC: 10498 case X86ISD::SBB: 10499 case X86ISD::SMUL: 10500 case X86ISD::UMUL: 10501 case X86ISD::INC: 10502 case X86ISD::DEC: 10503 case X86ISD::OR: 10504 case X86ISD::XOR: 10505 case X86ISD::AND: 10506 // These nodes' second result is a boolean. 10507 if (Op.getResNo() == 0) 10508 break; 10509 // Fallthrough 10510 case X86ISD::SETCC: 10511 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 10512 Mask.getBitWidth() - 1); 10513 break; 10514 } 10515} 10516 10517unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 10518 unsigned Depth) const { 10519 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 10520 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 10521 return Op.getValueType().getScalarType().getSizeInBits(); 10522 10523 // Fallback case. 10524 return 1; 10525} 10526 10527/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 10528/// node is a GlobalAddress + offset. 10529bool X86TargetLowering::isGAPlusOffset(SDNode *N, 10530 const GlobalValue* &GA, 10531 int64_t &Offset) const { 10532 if (N->getOpcode() == X86ISD::Wrapper) { 10533 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 10534 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 10535 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 10536 return true; 10537 } 10538 } 10539 return TargetLowering::isGAPlusOffset(N, GA, Offset); 10540} 10541 10542/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 10543/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 10544/// if the load addresses are consecutive, non-overlapping, and in the right 10545/// order. 10546static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 10547 TargetLowering::DAGCombinerInfo &DCI) { 10548 DebugLoc dl = N->getDebugLoc(); 10549 EVT VT = N->getValueType(0); 10550 10551 if (VT.getSizeInBits() != 128) 10552 return SDValue(); 10553 10554 // Don't create instructions with illegal types after legalize types has run. 10555 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10556 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 10557 return SDValue(); 10558 10559 SmallVector<SDValue, 16> Elts; 10560 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 10561 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 10562 10563 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 10564} 10565 10566/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 10567/// generation and convert it from being a bunch of shuffles and extracts 10568/// to a simple store and scalar loads to extract the elements. 10569static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 10570 const TargetLowering &TLI) { 10571 SDValue InputVector = N->getOperand(0); 10572 10573 // Only operate on vectors of 4 elements, where the alternative shuffling 10574 // gets to be more expensive. 10575 if (InputVector.getValueType() != MVT::v4i32) 10576 return SDValue(); 10577 10578 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 10579 // single use which is a sign-extend or zero-extend, and all elements are 10580 // used. 
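  //
  // The pattern being matched looks roughly like this in LLVM IR (value
  // names are illustrative; sext is accepted as well as zext):
  //   %e0 = extractelement <4 x i32> %v, i32 0
  //   %x0 = zext i32 %e0 to i64
  //   ...and likewise for elements 1, 2 and 3.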
10581 SmallVector<SDNode *, 4> Uses; 10582 unsigned ExtractedElements = 0; 10583 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 10584 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 10585 if (UI.getUse().getResNo() != InputVector.getResNo()) 10586 return SDValue(); 10587 10588 SDNode *Extract = *UI; 10589 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 10590 return SDValue(); 10591 10592 if (Extract->getValueType(0) != MVT::i32) 10593 return SDValue(); 10594 if (!Extract->hasOneUse()) 10595 return SDValue(); 10596 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 10597 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 10598 return SDValue(); 10599 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 10600 return SDValue(); 10601 10602 // Record which element was extracted. 10603 ExtractedElements |= 10604 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 10605 10606 Uses.push_back(Extract); 10607 } 10608 10609 // If not all the elements were used, this may not be worthwhile. 10610 if (ExtractedElements != 15) 10611 return SDValue(); 10612 10613 // Ok, we've now decided to do the transformation. 10614 DebugLoc dl = InputVector.getDebugLoc(); 10615 10616 // Store the value to a temporary stack slot. 10617 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 10618 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 10619 MachinePointerInfo(), false, false, 0); 10620 10621 // Replace each use (extract) with a load of the appropriate element. 10622 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 10623 UE = Uses.end(); UI != UE; ++UI) { 10624 SDNode *Extract = *UI; 10625 10626 // Compute the element's address. 10627 SDValue Idx = Extract->getOperand(1); 10628 unsigned EltSize = 10629 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 10630 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 10631 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 10632 10633 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), 10634 StackPtr, OffsetVal); 10635 10636 // Load the scalar. 10637 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 10638 ScalarAddr, MachinePointerInfo(), 10639 false, false, 0); 10640 10641 // Replace the exact with the load. 10642 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 10643 } 10644 10645 // The replacement was made in place; don't return anything. 10646 return SDValue(); 10647} 10648 10649/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 10650static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 10651 const X86Subtarget *Subtarget) { 10652 DebugLoc DL = N->getDebugLoc(); 10653 SDValue Cond = N->getOperand(0); 10654 // Get the LHS/RHS of the select. 10655 SDValue LHS = N->getOperand(1); 10656 SDValue RHS = N->getOperand(2); 10657 10658 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 10659 // instructions match the semantics of the common C idiom x<y?x:y but not 10660 // x<=y?x:y, because of how they handle negative zero (which can be 10661 // ignored in unsafe-math mode). 10662 if (Subtarget->hasSSE2() && 10663 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 10664 Cond.getOpcode() == ISD::SETCC) { 10665 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 10666 10667 unsigned Opcode = 0; 10668 // Check for x CC y ? x : y. 
10669 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 10670 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 10671 switch (CC) { 10672 default: break; 10673 case ISD::SETULT: 10674 // Converting this to a min would handle NaNs incorrectly, and swapping 10675 // the operands would cause it to handle comparisons between positive 10676 // and negative zero incorrectly. 10677 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 10678 if (!UnsafeFPMath && 10679 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 10680 break; 10681 std::swap(LHS, RHS); 10682 } 10683 Opcode = X86ISD::FMIN; 10684 break; 10685 case ISD::SETOLE: 10686 // Converting this to a min would handle comparisons between positive 10687 // and negative zero incorrectly. 10688 if (!UnsafeFPMath && 10689 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 10690 break; 10691 Opcode = X86ISD::FMIN; 10692 break; 10693 case ISD::SETULE: 10694 // Converting this to a min would handle both negative zeros and NaNs 10695 // incorrectly, but we can swap the operands to fix both. 10696 std::swap(LHS, RHS); 10697 case ISD::SETOLT: 10698 case ISD::SETLT: 10699 case ISD::SETLE: 10700 Opcode = X86ISD::FMIN; 10701 break; 10702 10703 case ISD::SETOGE: 10704 // Converting this to a max would handle comparisons between positive 10705 // and negative zero incorrectly. 10706 if (!UnsafeFPMath && 10707 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(LHS)) 10708 break; 10709 Opcode = X86ISD::FMAX; 10710 break; 10711 case ISD::SETUGT: 10712 // Converting this to a max would handle NaNs incorrectly, and swapping 10713 // the operands would cause it to handle comparisons between positive 10714 // and negative zero incorrectly. 10715 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 10716 if (!UnsafeFPMath && 10717 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 10718 break; 10719 std::swap(LHS, RHS); 10720 } 10721 Opcode = X86ISD::FMAX; 10722 break; 10723 case ISD::SETUGE: 10724 // Converting this to a max would handle both negative zeros and NaNs 10725 // incorrectly, but we can swap the operands to fix both. 10726 std::swap(LHS, RHS); 10727 case ISD::SETOGT: 10728 case ISD::SETGT: 10729 case ISD::SETGE: 10730 Opcode = X86ISD::FMAX; 10731 break; 10732 } 10733 // Check for x CC y ? y : x -- a min/max with reversed arms. 10734 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 10735 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 10736 switch (CC) { 10737 default: break; 10738 case ISD::SETOGE: 10739 // Converting this to a min would handle comparisons between positive 10740 // and negative zero incorrectly, and swapping the operands would 10741 // cause it to handle NaNs incorrectly. 10742 if (!UnsafeFPMath && 10743 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 10744 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10745 break; 10746 std::swap(LHS, RHS); 10747 } 10748 Opcode = X86ISD::FMIN; 10749 break; 10750 case ISD::SETUGT: 10751 // Converting this to a min would handle NaNs incorrectly. 10752 if (!UnsafeFPMath && 10753 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 10754 break; 10755 Opcode = X86ISD::FMIN; 10756 break; 10757 case ISD::SETUGE: 10758 // Converting this to a min would handle both negative zeros and NaNs 10759 // incorrectly, but we can swap the operands to fix both. 
10760 std::swap(LHS, RHS); 10761 case ISD::SETOGT: 10762 case ISD::SETGT: 10763 case ISD::SETGE: 10764 Opcode = X86ISD::FMIN; 10765 break; 10766 10767 case ISD::SETULT: 10768 // Converting this to a max would handle NaNs incorrectly. 10769 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10770 break; 10771 Opcode = X86ISD::FMAX; 10772 break; 10773 case ISD::SETOLE: 10774 // Converting this to a max would handle comparisons between positive 10775 // and negative zero incorrectly, and swapping the operands would 10776 // cause it to handle NaNs incorrectly. 10777 if (!UnsafeFPMath && 10778 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 10779 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10780 break; 10781 std::swap(LHS, RHS); 10782 } 10783 Opcode = X86ISD::FMAX; 10784 break; 10785 case ISD::SETULE: 10786 // Converting this to a max would handle both negative zeros and NaNs 10787 // incorrectly, but we can swap the operands to fix both. 10788 std::swap(LHS, RHS); 10789 case ISD::SETOLT: 10790 case ISD::SETLT: 10791 case ISD::SETLE: 10792 Opcode = X86ISD::FMAX; 10793 break; 10794 } 10795 } 10796 10797 if (Opcode) 10798 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 10799 } 10800 10801 // If this is a select between two integer constants, try to do some 10802 // optimizations. 10803 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 10804 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 10805 // Don't do this for crazy integer types. 10806 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 10807 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 10808 // so that TrueC (the true value) is larger than FalseC. 10809 bool NeedsCondInvert = false; 10810 10811 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 10812 // Efficiently invertible. 10813 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 10814 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 10815 isa<ConstantSDNode>(Cond.getOperand(1))))) { 10816 NeedsCondInvert = true; 10817 std::swap(TrueC, FalseC); 10818 } 10819 10820 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 10821 if (FalseC->getAPIntValue() == 0 && 10822 TrueC->getAPIntValue().isPowerOf2()) { 10823 if (NeedsCondInvert) // Invert the condition if needed. 10824 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10825 DAG.getConstant(1, Cond.getValueType())); 10826 10827 // Zero extend the condition if needed. 10828 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 10829 10830 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 10831 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 10832 DAG.getConstant(ShAmt, MVT::i8)); 10833 } 10834 10835 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 10836 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 10837 if (NeedsCondInvert) // Invert the condition if needed. 10838 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10839 DAG.getConstant(1, Cond.getValueType())); 10840 10841 // Zero extend the condition if needed. 10842 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 10843 FalseC->getValueType(0), Cond); 10844 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10845 SDValue(FalseC, 0)); 10846 } 10847 10848 // Optimize cases that will turn into an LEA instruction. This requires 10849 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
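      // For example (illustrative): "Cond ? 38 : 33" has Diff == 5, so it can
      // be emitted roughly as
      //   t = zext(Cond); t = lea(t + t*4); result = add t, 33
      // i.e. a setcc/zext, one LEA and one add, with no branch or cmov.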
10850 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 10851 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 10852 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 10853 10854 bool isFastMultiplier = false; 10855 if (Diff < 10) { 10856 switch ((unsigned char)Diff) { 10857 default: break; 10858 case 1: // result = add base, cond 10859 case 2: // result = lea base( , cond*2) 10860 case 3: // result = lea base(cond, cond*2) 10861 case 4: // result = lea base( , cond*4) 10862 case 5: // result = lea base(cond, cond*4) 10863 case 8: // result = lea base( , cond*8) 10864 case 9: // result = lea base(cond, cond*8) 10865 isFastMultiplier = true; 10866 break; 10867 } 10868 } 10869 10870 if (isFastMultiplier) { 10871 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 10872 if (NeedsCondInvert) // Invert the condition if needed. 10873 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10874 DAG.getConstant(1, Cond.getValueType())); 10875 10876 // Zero extend the condition if needed. 10877 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 10878 Cond); 10879 // Scale the condition by the difference. 10880 if (Diff != 1) 10881 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 10882 DAG.getConstant(Diff, Cond.getValueType())); 10883 10884 // Add the base if non-zero. 10885 if (FalseC->getAPIntValue() != 0) 10886 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10887 SDValue(FalseC, 0)); 10888 return Cond; 10889 } 10890 } 10891 } 10892 } 10893 10894 return SDValue(); 10895} 10896 10897/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 10898static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 10899 TargetLowering::DAGCombinerInfo &DCI) { 10900 DebugLoc DL = N->getDebugLoc(); 10901 10902 // If the flag operand isn't dead, don't touch this CMOV. 10903 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 10904 return SDValue(); 10905 10906 // If this is a select between two integer constants, try to do some 10907 // optimizations. Note that the operands are ordered the opposite of SELECT 10908 // operands. 10909 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 10910 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 10911 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 10912 // larger than FalseC (the false value). 10913 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 10914 10915 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 10916 CC = X86::GetOppositeBranchCondition(CC); 10917 std::swap(TrueC, FalseC); 10918 } 10919 10920 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 10921 // This is efficient for any integer data type (including i8/i16) and 10922 // shift amount. 10923 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 10924 SDValue Cond = N->getOperand(3); 10925 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10926 DAG.getConstant(CC, MVT::i8), Cond); 10927 10928 // Zero extend the condition if needed. 10929 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 10930 10931 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 10932 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 10933 DAG.getConstant(ShAmt, MVT::i8)); 10934 if (N->getNumValues() == 2) // Dead flag value? 10935 return DCI.CombineTo(N, Cond, SDValue()); 10936 return Cond; 10937 } 10938 10939 // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C)+cst. This is efficient 10940 // for any integer data type, including i8/i16. 10941 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 10942 SDValue Cond = N->getOperand(3); 10943 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10944 DAG.getConstant(CC, MVT::i8), Cond); 10945 10946 // Zero extend the condition if needed. 10947 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 10948 FalseC->getValueType(0), Cond); 10949 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10950 SDValue(FalseC, 0)); 10951 10952 if (N->getNumValues() == 2) // Dead flag value? 10953 return DCI.CombineTo(N, Cond, SDValue()); 10954 return Cond; 10955 } 10956 10957 // Optimize cases that will turn into an LEA instruction. This requires 10958 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 10959 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 10960 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 10961 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 10962 10963 bool isFastMultiplier = false; 10964 if (Diff < 10) { 10965 switch ((unsigned char)Diff) { 10966 default: break; 10967 case 1: // result = add base, cond 10968 case 2: // result = lea base( , cond*2) 10969 case 3: // result = lea base(cond, cond*2) 10970 case 4: // result = lea base( , cond*4) 10971 case 5: // result = lea base(cond, cond*4) 10972 case 8: // result = lea base( , cond*8) 10973 case 9: // result = lea base(cond, cond*8) 10974 isFastMultiplier = true; 10975 break; 10976 } 10977 } 10978 10979 if (isFastMultiplier) { 10980 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 10981 SDValue Cond = N->getOperand(3); 10982 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10983 DAG.getConstant(CC, MVT::i8), Cond); 10984 // Zero extend the condition if needed. 10985 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 10986 Cond); 10987 // Scale the condition by the difference. 10988 if (Diff != 1) 10989 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 10990 DAG.getConstant(Diff, Cond.getValueType())); 10991 10992 // Add the base if non-zero. 10993 if (FalseC->getAPIntValue() != 0) 10994 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10995 SDValue(FalseC, 0)); 10996 if (N->getNumValues() == 2) // Dead flag value? 10997 return DCI.CombineTo(N, Cond, SDValue()); 10998 return Cond; 10999 } 11000 } 11001 } 11002 } 11003 return SDValue(); 11004} 11005 11006 11007/// PerformMulCombine - Optimize a single multiply with constant into two 11008/// in order to implement it with two cheaper instructions, e.g. 11009/// LEA + SHL, LEA + LEA. 
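/// For example (illustrative): a multiply by 45 can be rewritten as (x*9)*5,
/// i.e. two LEAs, and a multiply by 40 as (x*5) << 3, i.e. an LEA plus a SHL.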
11010static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 11011 TargetLowering::DAGCombinerInfo &DCI) { 11012 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 11013 return SDValue(); 11014 11015 EVT VT = N->getValueType(0); 11016 if (VT != MVT::i64) 11017 return SDValue(); 11018 11019 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11020 if (!C) 11021 return SDValue(); 11022 uint64_t MulAmt = C->getZExtValue(); 11023 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 11024 return SDValue(); 11025 11026 uint64_t MulAmt1 = 0; 11027 uint64_t MulAmt2 = 0; 11028 if ((MulAmt % 9) == 0) { 11029 MulAmt1 = 9; 11030 MulAmt2 = MulAmt / 9; 11031 } else if ((MulAmt % 5) == 0) { 11032 MulAmt1 = 5; 11033 MulAmt2 = MulAmt / 5; 11034 } else if ((MulAmt % 3) == 0) { 11035 MulAmt1 = 3; 11036 MulAmt2 = MulAmt / 3; 11037 } 11038 if (MulAmt2 && 11039 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 11040 DebugLoc DL = N->getDebugLoc(); 11041 11042 if (isPowerOf2_64(MulAmt2) && 11043 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 11044 // If second multiplifer is pow2, issue it first. We want the multiply by 11045 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 11046 // is an add. 11047 std::swap(MulAmt1, MulAmt2); 11048 11049 SDValue NewMul; 11050 if (isPowerOf2_64(MulAmt1)) 11051 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 11052 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 11053 else 11054 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 11055 DAG.getConstant(MulAmt1, VT)); 11056 11057 if (isPowerOf2_64(MulAmt2)) 11058 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 11059 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 11060 else 11061 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 11062 DAG.getConstant(MulAmt2, VT)); 11063 11064 // Do not add new nodes to DAG combiner worklist. 11065 DCI.CombineTo(N, NewMul, false); 11066 } 11067 return SDValue(); 11068} 11069 11070static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 11071 SDValue N0 = N->getOperand(0); 11072 SDValue N1 = N->getOperand(1); 11073 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 11074 EVT VT = N0.getValueType(); 11075 11076 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 11077 // since the result of setcc_c is all zero's or all ones. 11078 if (N1C && N0.getOpcode() == ISD::AND && 11079 N0.getOperand(1).getOpcode() == ISD::Constant) { 11080 SDValue N00 = N0.getOperand(0); 11081 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 11082 ((N00.getOpcode() == ISD::ANY_EXTEND || 11083 N00.getOpcode() == ISD::ZERO_EXTEND) && 11084 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 11085 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 11086 APInt ShAmt = N1C->getAPIntValue(); 11087 Mask = Mask.shl(ShAmt); 11088 if (Mask != 0) 11089 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 11090 N00, DAG.getConstant(Mask, VT)); 11091 } 11092 } 11093 11094 return SDValue(); 11095} 11096 11097/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 11098/// when possible. 
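/// For example (illustrative): a v4i32 shift where every lane uses the same
/// amount, such as (shl %x, <5, 5, 5, 5>), can be emitted as a single pslld
/// via the x86_sse2_pslli_d intrinsic instead of per-element scalar shifts.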
11099static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 11100 const X86Subtarget *Subtarget) { 11101 EVT VT = N->getValueType(0); 11102 if (!VT.isVector() && VT.isInteger() && 11103 N->getOpcode() == ISD::SHL) 11104 return PerformSHLCombine(N, DAG); 11105 11106 // On X86 with SSE2 support, we can transform this to a vector shift if 11107 // all elements are shifted by the same amount. We can't do this in legalize 11108 // because the a constant vector is typically transformed to a constant pool 11109 // so we have no knowledge of the shift amount. 11110 if (!Subtarget->hasSSE2()) 11111 return SDValue(); 11112 11113 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 11114 return SDValue(); 11115 11116 SDValue ShAmtOp = N->getOperand(1); 11117 EVT EltVT = VT.getVectorElementType(); 11118 DebugLoc DL = N->getDebugLoc(); 11119 SDValue BaseShAmt = SDValue(); 11120 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 11121 unsigned NumElts = VT.getVectorNumElements(); 11122 unsigned i = 0; 11123 for (; i != NumElts; ++i) { 11124 SDValue Arg = ShAmtOp.getOperand(i); 11125 if (Arg.getOpcode() == ISD::UNDEF) continue; 11126 BaseShAmt = Arg; 11127 break; 11128 } 11129 for (; i != NumElts; ++i) { 11130 SDValue Arg = ShAmtOp.getOperand(i); 11131 if (Arg.getOpcode() == ISD::UNDEF) continue; 11132 if (Arg != BaseShAmt) { 11133 return SDValue(); 11134 } 11135 } 11136 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 11137 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 11138 SDValue InVec = ShAmtOp.getOperand(0); 11139 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 11140 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 11141 unsigned i = 0; 11142 for (; i != NumElts; ++i) { 11143 SDValue Arg = InVec.getOperand(i); 11144 if (Arg.getOpcode() == ISD::UNDEF) continue; 11145 BaseShAmt = Arg; 11146 break; 11147 } 11148 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 11149 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 11150 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 11151 if (C->getZExtValue() == SplatIdx) 11152 BaseShAmt = InVec.getOperand(1); 11153 } 11154 } 11155 if (BaseShAmt.getNode() == 0) 11156 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 11157 DAG.getIntPtrConstant(0)); 11158 } else 11159 return SDValue(); 11160 11161 // The shift amount is an i32. 11162 if (EltVT.bitsGT(MVT::i32)) 11163 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 11164 else if (EltVT.bitsLT(MVT::i32)) 11165 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 11166 11167 // The shift amount is identical so we can do a vector shift. 
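  // The splat recognized above comes either from a BUILD_VECTOR whose defined
  // operands all match, or from a splat VECTOR_SHUFFLE; e.g. (illustrative)
  //   (vector_shuffle (insert_vector_elt undef, %amt, 0), undef, <0,0,0,0>)
  // yields BaseShAmt = %amt.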
11168 SDValue ValOp = N->getOperand(0); 11169 switch (N->getOpcode()) { 11170 default: 11171 llvm_unreachable("Unknown shift opcode!"); 11172 break; 11173 case ISD::SHL: 11174 if (VT == MVT::v2i64) 11175 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11176 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 11177 ValOp, BaseShAmt); 11178 if (VT == MVT::v4i32) 11179 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11180 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 11181 ValOp, BaseShAmt); 11182 if (VT == MVT::v8i16) 11183 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11184 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 11185 ValOp, BaseShAmt); 11186 break; 11187 case ISD::SRA: 11188 if (VT == MVT::v4i32) 11189 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11190 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 11191 ValOp, BaseShAmt); 11192 if (VT == MVT::v8i16) 11193 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11194 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 11195 ValOp, BaseShAmt); 11196 break; 11197 case ISD::SRL: 11198 if (VT == MVT::v2i64) 11199 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11200 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 11201 ValOp, BaseShAmt); 11202 if (VT == MVT::v4i32) 11203 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11204 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 11205 ValOp, BaseShAmt); 11206 if (VT == MVT::v8i16) 11207 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11208 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 11209 ValOp, BaseShAmt); 11210 break; 11211 } 11212 return SDValue(); 11213} 11214 11215 11216static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 11217 TargetLowering::DAGCombinerInfo &DCI, 11218 const X86Subtarget *Subtarget) { 11219 if (DCI.isBeforeLegalizeOps()) 11220 return SDValue(); 11221 11222 // Want to form PANDN nodes, in the hopes of then easily combining them with 11223 // OR and AND nodes to form PBLEND/PSIGN. 
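  // PANDN computes "~LHS & RHS" (the pandn/andnps form), so e.g.
  // (illustrative) the pattern
  //   (and (xor X, <all-ones>), Y)
  // matched below becomes (PANDN X, Y) with the vnot folded away.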
11224 EVT VT = N->getValueType(0); 11225 if (VT != MVT::v2i64) 11226 return SDValue(); 11227 11228 SDValue N0 = N->getOperand(0); 11229 SDValue N1 = N->getOperand(1); 11230 DebugLoc DL = N->getDebugLoc(); 11231 11232 // Check LHS for vnot 11233 if (N0.getOpcode() == ISD::XOR && 11234 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 11235 return DAG.getNode(X86ISD::PANDN, DL, VT, N0.getOperand(0), N1); 11236 11237 // Check RHS for vnot 11238 if (N1.getOpcode() == ISD::XOR && 11239 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 11240 return DAG.getNode(X86ISD::PANDN, DL, VT, N1.getOperand(0), N0); 11241 11242 return SDValue(); 11243} 11244 11245static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 11246 TargetLowering::DAGCombinerInfo &DCI, 11247 const X86Subtarget *Subtarget) { 11248 if (DCI.isBeforeLegalizeOps()) 11249 return SDValue(); 11250 11251 EVT VT = N->getValueType(0); 11252 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) 11253 return SDValue(); 11254 11255 SDValue N0 = N->getOperand(0); 11256 SDValue N1 = N->getOperand(1); 11257 11258 // look for psign/blend 11259 if (Subtarget->hasSSSE3()) { 11260 if (VT == MVT::v2i64) { 11261 // Canonicalize pandn to RHS 11262 if (N0.getOpcode() == X86ISD::PANDN) 11263 std::swap(N0, N1); 11264 // or (and (m, x), (pandn m, y)) 11265 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::PANDN) { 11266 SDValue Mask = N1.getOperand(0); 11267 SDValue X = N1.getOperand(1); 11268 SDValue Y; 11269 if (N0.getOperand(0) == Mask) 11270 Y = N0.getOperand(1); 11271 if (N0.getOperand(1) == Mask) 11272 Y = N0.getOperand(0); 11273 11274 // Check to see if the mask appeared in both the AND and PANDN and 11275 if (!Y.getNode()) 11276 return SDValue(); 11277 11278 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 11279 if (Mask.getOpcode() != ISD::BITCAST || 11280 X.getOpcode() != ISD::BITCAST || 11281 Y.getOpcode() != ISD::BITCAST) 11282 return SDValue(); 11283 11284 // Look through mask bitcast. 11285 Mask = Mask.getOperand(0); 11286 EVT MaskVT = Mask.getValueType(); 11287 11288 // Validate that the Mask operand is a vector sra node. The sra node 11289 // will be an intrinsic. 11290 if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) 11291 return SDValue(); 11292 11293 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 11294 // there is no psrai.b 11295 switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { 11296 case Intrinsic::x86_sse2_psrai_w: 11297 case Intrinsic::x86_sse2_psrai_d: 11298 break; 11299 default: return SDValue(); 11300 } 11301 11302 // Check that the SRA is all signbits. 11303 SDValue SraC = Mask.getOperand(2); 11304 unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 11305 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 11306 if ((SraAmt + 1) != EltBits) 11307 return SDValue(); 11308 11309 DebugLoc DL = N->getDebugLoc(); 11310 11311 // Now we know we at least have a plendvb with the mask val. See if 11312 // we can form a psignb/w/d. 
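        // Reminder of the instruction semantics (illustrative): for each
        // element, psign(x, b) yields -x when b < 0, 0 when b == 0 and x when
        // b > 0.  The checks below additionally require y to be (0 - x) and
        // the x/y/mask types to match before forming it.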
11313 // psign = x.type == y.type == mask.type && y = sub(0, x); 11314 X = X.getOperand(0); 11315 Y = Y.getOperand(0); 11316 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 11317 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 11318 X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ 11319 unsigned Opc = 0; 11320 switch (EltBits) { 11321 case 8: Opc = X86ISD::PSIGNB; break; 11322 case 16: Opc = X86ISD::PSIGNW; break; 11323 case 32: Opc = X86ISD::PSIGND; break; 11324 default: break; 11325 } 11326 if (Opc) { 11327 SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); 11328 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); 11329 } 11330 } 11331 // PBLENDVB only available on SSE 4.1 11332 if (!Subtarget->hasSSE41()) 11333 return SDValue(); 11334 11335 X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); 11336 Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); 11337 Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); 11338 Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask); 11339 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); 11340 } 11341 } 11342 } 11343 11344 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 11345 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 11346 std::swap(N0, N1); 11347 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 11348 return SDValue(); 11349 if (!N0.hasOneUse() || !N1.hasOneUse()) 11350 return SDValue(); 11351 11352 SDValue ShAmt0 = N0.getOperand(1); 11353 if (ShAmt0.getValueType() != MVT::i8) 11354 return SDValue(); 11355 SDValue ShAmt1 = N1.getOperand(1); 11356 if (ShAmt1.getValueType() != MVT::i8) 11357 return SDValue(); 11358 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 11359 ShAmt0 = ShAmt0.getOperand(0); 11360 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 11361 ShAmt1 = ShAmt1.getOperand(0); 11362 11363 DebugLoc DL = N->getDebugLoc(); 11364 unsigned Opc = X86ISD::SHLD; 11365 SDValue Op0 = N0.getOperand(0); 11366 SDValue Op1 = N1.getOperand(0); 11367 if (ShAmt0.getOpcode() == ISD::SUB) { 11368 Opc = X86ISD::SHRD; 11369 std::swap(Op0, Op1); 11370 std::swap(ShAmt0, ShAmt1); 11371 } 11372 11373 unsigned Bits = VT.getSizeInBits(); 11374 if (ShAmt1.getOpcode() == ISD::SUB) { 11375 SDValue Sum = ShAmt1.getOperand(0); 11376 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 11377 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 11378 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 11379 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 11380 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 11381 return DAG.getNode(Opc, DL, VT, 11382 Op0, Op1, 11383 DAG.getNode(ISD::TRUNCATE, DL, 11384 MVT::i8, ShAmt0)); 11385 } 11386 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 11387 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 11388 if (ShAmt0C && 11389 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 11390 return DAG.getNode(Opc, DL, VT, 11391 N0.getOperand(0), N1.getOperand(0), 11392 DAG.getNode(ISD::TRUNCATE, DL, 11393 MVT::i8, ShAmt0)); 11394 } 11395 11396 return SDValue(); 11397} 11398 11399/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 11400static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 11401 const X86Subtarget *Subtarget) { 11402 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 11403 // the FP state in cases where an emms may be missing. 
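  // For example (illustrative): a 64-bit MMX-typed copy of the form
  //   store (load %p), %q
  // is rewritten below to go through an integer or SSE f64 register (or two
  // 32-bit halves), so no MMX register is written and no emms is needed.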
11404 // A preferable solution to the general problem is to figure out the right 11405 // places to insert EMMS. This qualifies as a quick hack. 11406 11407 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 11408 StoreSDNode *St = cast<StoreSDNode>(N); 11409 EVT VT = St->getValue().getValueType(); 11410 if (VT.getSizeInBits() != 64) 11411 return SDValue(); 11412 11413 const Function *F = DAG.getMachineFunction().getFunction(); 11414 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 11415 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 11416 && Subtarget->hasSSE2(); 11417 if ((VT.isVector() || 11418 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 11419 isa<LoadSDNode>(St->getValue()) && 11420 !cast<LoadSDNode>(St->getValue())->isVolatile() && 11421 St->getChain().hasOneUse() && !St->isVolatile()) { 11422 SDNode* LdVal = St->getValue().getNode(); 11423 LoadSDNode *Ld = 0; 11424 int TokenFactorIndex = -1; 11425 SmallVector<SDValue, 8> Ops; 11426 SDNode* ChainVal = St->getChain().getNode(); 11427 // Must be a store of a load. We currently handle two cases: the load 11428 // is a direct child, and it's under an intervening TokenFactor. It is 11429 // possible to dig deeper under nested TokenFactors. 11430 if (ChainVal == LdVal) 11431 Ld = cast<LoadSDNode>(St->getChain()); 11432 else if (St->getValue().hasOneUse() && 11433 ChainVal->getOpcode() == ISD::TokenFactor) { 11434 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 11435 if (ChainVal->getOperand(i).getNode() == LdVal) { 11436 TokenFactorIndex = i; 11437 Ld = cast<LoadSDNode>(St->getValue()); 11438 } else 11439 Ops.push_back(ChainVal->getOperand(i)); 11440 } 11441 } 11442 11443 if (!Ld || !ISD::isNormalLoad(Ld)) 11444 return SDValue(); 11445 11446 // If this is not the MMX case, i.e. we are just turning i64 load/store 11447 // into f64 load/store, avoid the transformation if there are multiple 11448 // uses of the loaded value. 11449 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 11450 return SDValue(); 11451 11452 DebugLoc LdDL = Ld->getDebugLoc(); 11453 DebugLoc StDL = N->getDebugLoc(); 11454 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 11455 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 11456 // pair instead. 11457 if (Subtarget->is64Bit() || F64IsLegal) { 11458 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 11459 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 11460 Ld->getPointerInfo(), Ld->isVolatile(), 11461 Ld->isNonTemporal(), Ld->getAlignment()); 11462 SDValue NewChain = NewLd.getValue(1); 11463 if (TokenFactorIndex != -1) { 11464 Ops.push_back(NewChain); 11465 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11466 Ops.size()); 11467 } 11468 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 11469 St->getPointerInfo(), 11470 St->isVolatile(), St->isNonTemporal(), 11471 St->getAlignment()); 11472 } 11473 11474 // Otherwise, lower to two pairs of 32-bit loads / stores. 
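    // (Illustrative) the low dword is copied from the load address and the
    // high dword from that address + 4, and both loads are folded back into
    // the original chain through a TokenFactor when one was present.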
11475 SDValue LoAddr = Ld->getBasePtr(); 11476 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 11477 DAG.getConstant(4, MVT::i32)); 11478 11479 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 11480 Ld->getPointerInfo(), 11481 Ld->isVolatile(), Ld->isNonTemporal(), 11482 Ld->getAlignment()); 11483 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 11484 Ld->getPointerInfo().getWithOffset(4), 11485 Ld->isVolatile(), Ld->isNonTemporal(), 11486 MinAlign(Ld->getAlignment(), 4)); 11487 11488 SDValue NewChain = LoLd.getValue(1); 11489 if (TokenFactorIndex != -1) { 11490 Ops.push_back(LoLd); 11491 Ops.push_back(HiLd); 11492 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11493 Ops.size()); 11494 } 11495 11496 LoAddr = St->getBasePtr(); 11497 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 11498 DAG.getConstant(4, MVT::i32)); 11499 11500 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 11501 St->getPointerInfo(), 11502 St->isVolatile(), St->isNonTemporal(), 11503 St->getAlignment()); 11504 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 11505 St->getPointerInfo().getWithOffset(4), 11506 St->isVolatile(), 11507 St->isNonTemporal(), 11508 MinAlign(St->getAlignment(), 4)); 11509 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 11510 } 11511 return SDValue(); 11512} 11513 11514/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 11515/// X86ISD::FXOR nodes. 11516static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 11517 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 11518 // F[X]OR(0.0, x) -> x 11519 // F[X]OR(x, 0.0) -> x 11520 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11521 if (C->getValueAPF().isPosZero()) 11522 return N->getOperand(1); 11523 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11524 if (C->getValueAPF().isPosZero()) 11525 return N->getOperand(0); 11526 return SDValue(); 11527} 11528 11529/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 11530static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 11531 // FAND(0.0, x) -> 0.0 11532 // FAND(x, 0.0) -> 0.0 11533 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11534 if (C->getValueAPF().isPosZero()) 11535 return N->getOperand(0); 11536 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11537 if (C->getValueAPF().isPosZero()) 11538 return N->getOperand(1); 11539 return SDValue(); 11540} 11541 11542static SDValue PerformBTCombine(SDNode *N, 11543 SelectionDAG &DAG, 11544 TargetLowering::DAGCombinerInfo &DCI) { 11545 // BT ignores high bits in the bit index operand. 
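  // For example (illustrative): with a 32-bit operand only the low 5 bits of
  // the index matter, so an explicit "and $31"-style mask feeding the index
  // can be removed via SimplifyDemandedBits below.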
11546 SDValue Op1 = N->getOperand(1); 11547 if (Op1.hasOneUse()) { 11548 unsigned BitWidth = Op1.getValueSizeInBits(); 11549 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 11550 APInt KnownZero, KnownOne; 11551 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 11552 !DCI.isBeforeLegalizeOps()); 11553 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11554 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 11555 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 11556 DCI.CommitTargetLoweringOpt(TLO); 11557 } 11558 return SDValue(); 11559} 11560 11561static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 11562 SDValue Op = N->getOperand(0); 11563 if (Op.getOpcode() == ISD::BITCAST) 11564 Op = Op.getOperand(0); 11565 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 11566 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 11567 VT.getVectorElementType().getSizeInBits() == 11568 OpVT.getVectorElementType().getSizeInBits()) { 11569 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 11570 } 11571 return SDValue(); 11572} 11573 11574static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 11575 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 11576 // (and (i32 x86isd::setcc_carry), 1) 11577 // This eliminates the zext. This transformation is necessary because 11578 // ISD::SETCC is always legalized to i8. 11579 DebugLoc dl = N->getDebugLoc(); 11580 SDValue N0 = N->getOperand(0); 11581 EVT VT = N->getValueType(0); 11582 if (N0.getOpcode() == ISD::AND && 11583 N0.hasOneUse() && 11584 N0.getOperand(0).hasOneUse()) { 11585 SDValue N00 = N0.getOperand(0); 11586 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 11587 return SDValue(); 11588 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 11589 if (!C || C->getZExtValue() != 1) 11590 return SDValue(); 11591 return DAG.getNode(ISD::AND, dl, VT, 11592 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 11593 N00.getOperand(0), N00.getOperand(1)), 11594 DAG.getConstant(1, VT)); 11595 } 11596 11597 return SDValue(); 11598} 11599 11600// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 11601static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 11602 unsigned X86CC = N->getConstantOperandVal(0); 11603 SDValue EFLAG = N->getOperand(1); 11604 DebugLoc DL = N->getDebugLoc(); 11605 11606 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 11607 // a zext and produces an all-ones bit which is more useful than 0/1 in some 11608 // cases. 11609 if (X86CC == X86::COND_B) 11610 return DAG.getNode(ISD::AND, DL, MVT::i8, 11611 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 11612 DAG.getConstant(X86CC, MVT::i8), EFLAG), 11613 DAG.getConstant(1, MVT::i8)); 11614 11615 return SDValue(); 11616} 11617 11618// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 11619static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 11620 X86TargetLowering::DAGCombinerInfo &DCI) { 11621 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 11622 // the result is either zero or one (depending on the input carry bit). 11623 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 11624 if (X86::isZeroNode(N->getOperand(0)) && 11625 X86::isZeroNode(N->getOperand(1)) && 11626 // We don't have a good way to replace an EFLAGS use, so only do this when 11627 // dead right now. 
11628 SDValue(N, 1).use_empty()) { 11629 DebugLoc DL = N->getDebugLoc(); 11630 EVT VT = N->getValueType(0); 11631 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 11632 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 11633 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 11634 DAG.getConstant(X86::COND_B,MVT::i8), 11635 N->getOperand(2)), 11636 DAG.getConstant(1, VT)); 11637 return DCI.CombineTo(N, Res1, CarryOut); 11638 } 11639 11640 return SDValue(); 11641} 11642 11643// fold (add Y, (sete X, 0)) -> adc 0, Y 11644// (add Y, (setne X, 0)) -> sbb -1, Y 11645// (sub (sete X, 0), Y) -> sbb 0, Y 11646// (sub (setne X, 0), Y) -> adc -1, Y 11647static SDValue OptimizeConditonalInDecrement(SDNode *N, SelectionDAG &DAG) { 11648 DebugLoc DL = N->getDebugLoc(); 11649 11650 // Look through ZExts. 11651 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 11652 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 11653 return SDValue(); 11654 11655 SDValue SetCC = Ext.getOperand(0); 11656 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 11657 return SDValue(); 11658 11659 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 11660 if (CC != X86::COND_E && CC != X86::COND_NE) 11661 return SDValue(); 11662 11663 SDValue Cmp = SetCC.getOperand(1); 11664 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 11665 !X86::isZeroNode(Cmp.getOperand(1)) || 11666 !Cmp.getOperand(0).getValueType().isInteger()) 11667 return SDValue(); 11668 11669 SDValue CmpOp0 = Cmp.getOperand(0); 11670 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 11671 DAG.getConstant(1, CmpOp0.getValueType())); 11672 11673 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 11674 if (CC == X86::COND_NE) 11675 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 11676 DL, OtherVal.getValueType(), OtherVal, 11677 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 11678 return DAG.getNode(N->getOpcode() == ISD::SUB ? 
X86ISD::SBB : X86ISD::ADC, 11679 DL, OtherVal.getValueType(), OtherVal, 11680 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 11681} 11682 11683SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 11684 DAGCombinerInfo &DCI) const { 11685 SelectionDAG &DAG = DCI.DAG; 11686 switch (N->getOpcode()) { 11687 default: break; 11688 case ISD::EXTRACT_VECTOR_ELT: 11689 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 11690 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 11691 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 11692 case ISD::ADD: 11693 case ISD::SUB: return OptimizeConditonalInDecrement(N, DAG); 11694 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); 11695 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 11696 case ISD::SHL: 11697 case ISD::SRA: 11698 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 11699 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); 11700 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 11701 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 11702 case X86ISD::FXOR: 11703 case X86ISD::FOR: return PerformFORCombine(N, DAG); 11704 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 11705 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 11706 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 11707 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 11708 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); 11709 case X86ISD::SHUFPS: // Handle all target specific shuffles 11710 case X86ISD::SHUFPD: 11711 case X86ISD::PALIGN: 11712 case X86ISD::PUNPCKHBW: 11713 case X86ISD::PUNPCKHWD: 11714 case X86ISD::PUNPCKHDQ: 11715 case X86ISD::PUNPCKHQDQ: 11716 case X86ISD::UNPCKHPS: 11717 case X86ISD::UNPCKHPD: 11718 case X86ISD::PUNPCKLBW: 11719 case X86ISD::PUNPCKLWD: 11720 case X86ISD::PUNPCKLDQ: 11721 case X86ISD::PUNPCKLQDQ: 11722 case X86ISD::UNPCKLPS: 11723 case X86ISD::UNPCKLPD: 11724 case X86ISD::MOVHLPS: 11725 case X86ISD::MOVLHPS: 11726 case X86ISD::PSHUFD: 11727 case X86ISD::PSHUFHW: 11728 case X86ISD::PSHUFLW: 11729 case X86ISD::MOVSS: 11730 case X86ISD::MOVSD: 11731 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI); 11732 } 11733 11734 return SDValue(); 11735} 11736 11737/// isTypeDesirableForOp - Return true if the target has native support for 11738/// the specified value type and it is 'desirable' to use the type for the 11739/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 11740/// instruction encodings are longer and some i16 instructions are slow. 11741bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 11742 if (!isTypeLegal(VT)) 11743 return false; 11744 if (VT != MVT::i16) 11745 return true; 11746 11747 switch (Opc) { 11748 default: 11749 return true; 11750 case ISD::LOAD: 11751 case ISD::SIGN_EXTEND: 11752 case ISD::ZERO_EXTEND: 11753 case ISD::ANY_EXTEND: 11754 case ISD::SHL: 11755 case ISD::SRL: 11756 case ISD::SUB: 11757 case ISD::ADD: 11758 case ISD::MUL: 11759 case ISD::AND: 11760 case ISD::OR: 11761 case ISD::XOR: 11762 return false; 11763 } 11764} 11765 11766/// IsDesirableToPromoteOp - This method query the target whether it is 11767/// beneficial for dag combiner to promote the specified node. If true, it 11768/// should return the desired promotion type by reference. 
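/// For example (illustrative): promoting an i16 add or shift to i32 avoids
/// the 0x66 operand-size prefix and partial-register stalls; the code below
/// declines only when the promotion would lose a load- or store-folding
/// opportunity.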
11769bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 11770 EVT VT = Op.getValueType(); 11771 if (VT != MVT::i16) 11772 return false; 11773 11774 bool Promote = false; 11775 bool Commute = false; 11776 switch (Op.getOpcode()) { 11777 default: break; 11778 case ISD::LOAD: { 11779 LoadSDNode *LD = cast<LoadSDNode>(Op); 11780 // If the non-extending load has a single use and it's not live out, then it 11781 // might be folded. 11782 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 11783 Op.hasOneUse()*/) { 11784 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 11785 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 11786 // The only case where we'd want to promote LOAD (rather then it being 11787 // promoted as an operand is when it's only use is liveout. 11788 if (UI->getOpcode() != ISD::CopyToReg) 11789 return false; 11790 } 11791 } 11792 Promote = true; 11793 break; 11794 } 11795 case ISD::SIGN_EXTEND: 11796 case ISD::ZERO_EXTEND: 11797 case ISD::ANY_EXTEND: 11798 Promote = true; 11799 break; 11800 case ISD::SHL: 11801 case ISD::SRL: { 11802 SDValue N0 = Op.getOperand(0); 11803 // Look out for (store (shl (load), x)). 11804 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 11805 return false; 11806 Promote = true; 11807 break; 11808 } 11809 case ISD::ADD: 11810 case ISD::MUL: 11811 case ISD::AND: 11812 case ISD::OR: 11813 case ISD::XOR: 11814 Commute = true; 11815 // fallthrough 11816 case ISD::SUB: { 11817 SDValue N0 = Op.getOperand(0); 11818 SDValue N1 = Op.getOperand(1); 11819 if (!Commute && MayFoldLoad(N1)) 11820 return false; 11821 // Avoid disabling potential load folding opportunities. 11822 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 11823 return false; 11824 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 11825 return false; 11826 Promote = true; 11827 } 11828 } 11829 11830 PVT = MVT::i32; 11831 return Promote; 11832} 11833 11834//===----------------------------------------------------------------------===// 11835// X86 Inline Assembly Support 11836//===----------------------------------------------------------------------===// 11837 11838bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 11839 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 11840 11841 std::string AsmStr = IA->getAsmString(); 11842 11843 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 11844 SmallVector<StringRef, 4> AsmPieces; 11845 SplitString(AsmStr, AsmPieces, ";\n"); 11846 11847 switch (AsmPieces.size()) { 11848 default: return false; 11849 case 1: 11850 AsmStr = AsmPieces[0]; 11851 AsmPieces.clear(); 11852 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 11853 11854 // FIXME: this should verify that we are targetting a 486 or better. If not, 11855 // we will turn this bswap into something that will be lowered to logical ops 11856 // instead of emitting the bswap asm. For now, we don't support 486 or lower 11857 // so don't worry about this. 11858 // bswap $0 11859 if (AsmPieces.size() == 2 && 11860 (AsmPieces[0] == "bswap" || 11861 AsmPieces[0] == "bswapq" || 11862 AsmPieces[0] == "bswapl") && 11863 (AsmPieces[1] == "$0" || 11864 AsmPieces[1] == "${0:q}")) { 11865 // No need to check constraints, nothing other than the equivalent of 11866 // "=r,0" would be valid here. 
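      // For example (illustrative): GCC-style inline asm such as
      //   asm("bswap %0" : "=r"(x) : "0"(x));
      // reaches this point as "bswap $0" with constraint string "=r,0" and is
      // replaced with a call to the llvm.bswap intrinsic by LowerToByteSwap.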
11867 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 11868 if (!Ty || Ty->getBitWidth() % 16 != 0) 11869 return false; 11870 return IntrinsicLowering::LowerToByteSwap(CI); 11871 } 11872 // rorw $$8, ${0:w} --> llvm.bswap.i16 11873 if (CI->getType()->isIntegerTy(16) && 11874 AsmPieces.size() == 3 && 11875 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 11876 AsmPieces[1] == "$$8," && 11877 AsmPieces[2] == "${0:w}" && 11878 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 11879 AsmPieces.clear(); 11880 const std::string &ConstraintsStr = IA->getConstraintString(); 11881 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 11882 std::sort(AsmPieces.begin(), AsmPieces.end()); 11883 if (AsmPieces.size() == 4 && 11884 AsmPieces[0] == "~{cc}" && 11885 AsmPieces[1] == "~{dirflag}" && 11886 AsmPieces[2] == "~{flags}" && 11887 AsmPieces[3] == "~{fpsr}") { 11888 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 11889 if (!Ty || Ty->getBitWidth() % 16 != 0) 11890 return false; 11891 return IntrinsicLowering::LowerToByteSwap(CI); 11892 } 11893 } 11894 break; 11895 case 3: 11896 if (CI->getType()->isIntegerTy(32) && 11897 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 11898 SmallVector<StringRef, 4> Words; 11899 SplitString(AsmPieces[0], Words, " \t,"); 11900 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 11901 Words[2] == "${0:w}") { 11902 Words.clear(); 11903 SplitString(AsmPieces[1], Words, " \t,"); 11904 if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && 11905 Words[2] == "$0") { 11906 Words.clear(); 11907 SplitString(AsmPieces[2], Words, " \t,"); 11908 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 11909 Words[2] == "${0:w}") { 11910 AsmPieces.clear(); 11911 const std::string &ConstraintsStr = IA->getConstraintString(); 11912 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 11913 std::sort(AsmPieces.begin(), AsmPieces.end()); 11914 if (AsmPieces.size() == 4 && 11915 AsmPieces[0] == "~{cc}" && 11916 AsmPieces[1] == "~{dirflag}" && 11917 AsmPieces[2] == "~{flags}" && 11918 AsmPieces[3] == "~{fpsr}") { 11919 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 11920 if (!Ty || Ty->getBitWidth() % 16 != 0) 11921 return false; 11922 return IntrinsicLowering::LowerToByteSwap(CI); 11923 } 11924 } 11925 } 11926 } 11927 } 11928 11929 if (CI->getType()->isIntegerTy(64)) { 11930 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 11931 if (Constraints.size() >= 2 && 11932 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 11933 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 11934 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 11935 SmallVector<StringRef, 4> Words; 11936 SplitString(AsmPieces[0], Words, " \t"); 11937 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 11938 Words.clear(); 11939 SplitString(AsmPieces[1], Words, " \t"); 11940 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 11941 Words.clear(); 11942 SplitString(AsmPieces[2], Words, " \t,"); 11943 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 11944 Words[2] == "%edx") { 11945 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 11946 if (!Ty || Ty->getBitWidth() % 16 != 0) 11947 return false; 11948 return IntrinsicLowering::LowerToByteSwap(CI); 11949 } 11950 } 11951 } 11952 } 11953 } 11954 break; 11955 } 11956 return false; 11957} 11958 11959 11960 
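// For example (illustrative): the classic i386 idiom for byte-swapping a
// 64-bit value held in the EDX:EAX pair,
//   asm("bswap %%eax\n\tbswap %%edx\n\txchgl %%eax, %%edx"
//       : "=A"(v) : "0"(v));
// is matched by the three-piece case in ExpandInlineAsm above and lowered to
// a call to the llvm.bswap.i64 intrinsic.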
11961/// getConstraintType - Given a constraint letter, return the type of 11962/// constraint it is for this target. 11963X86TargetLowering::ConstraintType 11964X86TargetLowering::getConstraintType(const std::string &Constraint) const { 11965 if (Constraint.size() == 1) { 11966 switch (Constraint[0]) { 11967 case 'R': 11968 case 'q': 11969 case 'Q': 11970 case 'f': 11971 case 't': 11972 case 'u': 11973 case 'y': 11974 case 'x': 11975 case 'Y': 11976 return C_RegisterClass; 11977 case 'a': 11978 case 'b': 11979 case 'c': 11980 case 'd': 11981 case 'S': 11982 case 'D': 11983 case 'A': 11984 return C_Register; 11985 case 'I': 11986 case 'J': 11987 case 'K': 11988 case 'L': 11989 case 'M': 11990 case 'N': 11991 case 'G': 11992 case 'C': 11993 case 'e': 11994 case 'Z': 11995 return C_Other; 11996 default: 11997 break; 11998 } 11999 } 12000 return TargetLowering::getConstraintType(Constraint); 12001} 12002 12003/// Examine constraint type and operand type and determine a weight value. 12004/// This object must already have been set up with the operand type 12005/// and the current alternative constraint selected. 12006TargetLowering::ConstraintWeight 12007 X86TargetLowering::getSingleConstraintMatchWeight( 12008 AsmOperandInfo &info, const char *constraint) const { 12009 ConstraintWeight weight = CW_Invalid; 12010 Value *CallOperandVal = info.CallOperandVal; 12011 // If we don't have a value, we can't do a match, 12012 // but allow it at the lowest weight. 12013 if (CallOperandVal == NULL) 12014 return CW_Default; 12015 const Type *type = CallOperandVal->getType(); 12016 // Look at the constraint type. 12017 switch (*constraint) { 12018 default: 12019 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 12020 case 'R': 12021 case 'q': 12022 case 'Q': 12023 case 'a': 12024 case 'b': 12025 case 'c': 12026 case 'd': 12027 case 'S': 12028 case 'D': 12029 case 'A': 12030 if (CallOperandVal->getType()->isIntegerTy()) 12031 weight = CW_SpecificReg; 12032 break; 12033 case 'f': 12034 case 't': 12035 case 'u': 12036 if (type->isFloatingPointTy()) 12037 weight = CW_SpecificReg; 12038 break; 12039 case 'y': 12040 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 12041 weight = CW_SpecificReg; 12042 break; 12043 case 'x': 12044 case 'Y': 12045 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) 12046 weight = CW_Register; 12047 break; 12048 case 'I': 12049 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 12050 if (C->getZExtValue() <= 31) 12051 weight = CW_Constant; 12052 } 12053 break; 12054 case 'J': 12055 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 12056 if (C->getZExtValue() <= 63) 12057 weight = CW_Constant; 12058 } 12059 break; 12060 case 'K': 12061 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 12062 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 12063 weight = CW_Constant; 12064 } 12065 break; 12066 case 'L': 12067 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 12068 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 12069 weight = CW_Constant; 12070 } 12071 break; 12072 case 'M': 12073 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 12074 if (C->getZExtValue() <= 3) 12075 weight = CW_Constant; 12076 } 12077 break; 12078 case 'N': 12079 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 12080 if (C->getZExtValue() <= 0xff) 12081 weight = CW_Constant; 12082 } 12083 break; 12084 case 'G': 12085 case 'C': 12086 if 
(dyn_cast<ConstantFP>(CallOperandVal)) { 12087 weight = CW_Constant; 12088 } 12089 break; 12090 case 'e': 12091 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 12092 if ((C->getSExtValue() >= -0x80000000LL) && 12093 (C->getSExtValue() <= 0x7fffffffLL)) 12094 weight = CW_Constant; 12095 } 12096 break; 12097 case 'Z': 12098 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 12099 if (C->getZExtValue() <= 0xffffffff) 12100 weight = CW_Constant; 12101 } 12102 break; 12103 } 12104 return weight; 12105} 12106 12107/// LowerXConstraint - try to replace an X constraint, which matches anything, 12108/// with another that has more specific requirements based on the type of the 12109/// corresponding operand. 12110const char *X86TargetLowering:: 12111LowerXConstraint(EVT ConstraintVT) const { 12112 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 12113 // 'f' like normal targets. 12114 if (ConstraintVT.isFloatingPoint()) { 12115 if (Subtarget->hasXMMInt()) 12116 return "Y"; 12117 if (Subtarget->hasXMM()) 12118 return "x"; 12119 } 12120 12121 return TargetLowering::LowerXConstraint(ConstraintVT); 12122} 12123 12124/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 12125/// vector. If it is invalid, don't add anything to Ops. 12126void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 12127 char Constraint, 12128 std::vector<SDValue>&Ops, 12129 SelectionDAG &DAG) const { 12130 SDValue Result(0, 0); 12131 12132 switch (Constraint) { 12133 default: break; 12134 case 'I': 12135 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12136 if (C->getZExtValue() <= 31) { 12137 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 12138 break; 12139 } 12140 } 12141 return; 12142 case 'J': 12143 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12144 if (C->getZExtValue() <= 63) { 12145 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 12146 break; 12147 } 12148 } 12149 return; 12150 case 'K': 12151 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12152 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 12153 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 12154 break; 12155 } 12156 } 12157 return; 12158 case 'N': 12159 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12160 if (C->getZExtValue() <= 255) { 12161 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 12162 break; 12163 } 12164 } 12165 return; 12166 case 'e': { 12167 // 32-bit signed value 12168 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12169 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 12170 C->getSExtValue())) { 12171 // Widen to 64 bits here to get it sign extended. 12172 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 12173 break; 12174 } 12175 // FIXME gcc accepts some relocatable values here too, but only in certain 12176 // memory models; it's complicated. 12177 } 12178 return; 12179 } 12180 case 'Z': { 12181 // 32-bit unsigned value 12182 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 12183 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 12184 C->getZExtValue())) { 12185 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 12186 break; 12187 } 12188 } 12189 // FIXME gcc accepts some relocatable values here too, but only in certain 12190 // memory models; it's complicated. 
12191 return; 12192 } 12193 case 'i': { 12194 // Literal immediates are always ok. 12195 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 12196 // Widen to 64 bits here to get it sign extended. 12197 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 12198 break; 12199 } 12200 12201 // In any sort of PIC mode addresses need to be computed at runtime by 12202 // adding in a register or some sort of table lookup. These can't 12203 // be used as immediates. 12204 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 12205 return; 12206 12207 // If we are in non-pic codegen mode, we allow the address of a global (with 12208 // an optional displacement) to be used with 'i'. 12209 GlobalAddressSDNode *GA = 0; 12210 int64_t Offset = 0; 12211 12212 // Match either (GA), (GA+C), (GA+C1+C2), etc. 12213 while (1) { 12214 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 12215 Offset += GA->getOffset(); 12216 break; 12217 } else if (Op.getOpcode() == ISD::ADD) { 12218 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 12219 Offset += C->getZExtValue(); 12220 Op = Op.getOperand(0); 12221 continue; 12222 } 12223 } else if (Op.getOpcode() == ISD::SUB) { 12224 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 12225 Offset += -C->getZExtValue(); 12226 Op = Op.getOperand(0); 12227 continue; 12228 } 12229 } 12230 12231 // Otherwise, this isn't something we can handle, reject it. 12232 return; 12233 } 12234 12235 const GlobalValue *GV = GA->getGlobal(); 12236 // If we require an extra load to get this address, as in PIC mode, we 12237 // can't accept it. 12238 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 12239 getTargetMachine()))) 12240 return; 12241 12242 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 12243 GA->getValueType(0), Offset); 12244 break; 12245 } 12246 } 12247 12248 if (Result.getNode()) { 12249 Ops.push_back(Result); 12250 return; 12251 } 12252 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 12253} 12254 12255std::vector<unsigned> X86TargetLowering:: 12256getRegClassForInlineAsmConstraint(const std::string &Constraint, 12257 EVT VT) const { 12258 if (Constraint.size() == 1) { 12259 // FIXME: not handling fp-stack yet! 12260 switch (Constraint[0]) { // GCC X86 Constraint Letters 12261 default: break; // Unknown constraint letter 12262 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
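    // For example (illustrative): in 64-bit mode "q" may return any of the
    // sixteen general purpose registers (in the width matching VT, R8-R15
    // included), while in 32-bit mode it falls through to the 'Q' set, i.e.
    // only EAX/EDX/ECX/EBX and their sub-registers.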
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32)
          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
                                       X86::R10D,X86::R11D,X86::R12D,
                                       X86::R13D,X86::R14D,X86::R15D,
                                       X86::EBP, X86::ESP, 0);
        else if (VT == MVT::i16)
          return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX,
                                       X86::SI, X86::DI, X86::R8W, X86::R9W,
                                       X86::R10W,X86::R11W,X86::R12W,
                                       X86::R13W,X86::R14W,X86::R15W,
                                       X86::BP, X86::SP, 0);
        else if (VT == MVT::i8)
          return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL,
                                       X86::SIL, X86::DIL, X86::R8B, X86::R9B,
                                       X86::R10B,X86::R11B,X86::R12B,
                                       X86::R13B,X86::R14B,X86::R15B,
                                       X86::BPL, X86::SPL, 0);

        else if (VT == MVT::i64)
          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
                                       X86::RSI, X86::RDI, X86::R8, X86::R9,
                                       X86::R10, X86::R11, X86::R12,
                                       X86::R13, X86::R14, X86::R15,
                                       X86::RBP, X86::RSP, 0);

        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasXMMInt()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasXMM()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;  // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
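  // Illustrative example (informal): a constraint string of "{ax}" with an
  // i32 operand comes back from the generic mapper as (X86::AX, GR16); the
  // remapping below rewrites it to (X86::EAX, GR32RegisterClass), matching
  // the "{ax},i32" -> {eax} intent described above.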
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
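// Illustrative example (informal): given the constraint "{xmm0}" and a v4f32
// operand, the generic mapper may hand back XMM0 in FR32RegisterClass; the
// fix-up at the end of getRegForInlineAsmConstraint above reassigns the class
// to VR128RegisterClass so the full 128-bit register is used.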