X86ISelLowering.cpp revision a0fd0d5b27257c7397e9f6e4234cd600683c9248
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86ShuffleDecode.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {

  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();

  if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
    if (is64Bit)
      return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (TM.getSubtarget<X86Subtarget>().isTargetELF()) {
    if (is64Bit)
      return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  }
  if (TM.getSubtarget<X86Subtarget>().isTargetCOFF())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasXMMInt();
  X86ScalarSSEf32 = Subtarget->hasXMM();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.
  static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
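  // (getSetCCResultType below likewise reports MVT::i8.)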
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2");
    setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C);
    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86.
  // Using the two-result multiply instructions when both high and low
  // results are needed must be arranged by dagcombine.
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTTZ, MVT::i16, Custom);
  setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasXMM())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER so we should lower this.
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
  setShouldFoldAtomicFences(true);

  // Expand certain atomics
  for (unsigned i = 0, e = 4; i != e; ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
  }

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing() || Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
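    // (The magnitude is masked with ANDPD and the sign bit of the second
    // operand is merged back in with ORPD.)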
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND, MVT::v8i8, Expand);
  setOperationAction(ISD::AND, MVT::v4i16, Expand);
  setOperationAction(ISD::AND, MVT::v2i32, Expand);
  setOperationAction(ISD::AND, MVT::v1i64, Expand);
  setOperationAction(ISD::OR, MVT::v8i8, Expand);
  setOperationAction(ISD::OR, MVT::v4i16, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::OR, MVT::v1i64, Expand);
  setOperationAction(ISD::XOR, MVT::v8i8, Expand);
  setOperationAction(ISD::XOR, MVT::v4i16, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  setOperationAction(ISD::XOR, MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
  setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);

  if (!UseSoftFloat && Subtarget->hasXMM()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasXMMInt()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
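    // (These operations are type-agnostic on 128-bit XMM registers, so
    // promoting them to v2i64 lets one set of patterns cover every type.)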
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    // i8 and i16 vectors are custom because the source register and memory
    // operand types are not the same width. f32 vectors are custom since
    // the immediate controlling the insert encodes additional information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42())
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out - v16i16, v32i8
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
    // Add/Sub/Mul with overflow operations are custom lowered.
    MVT VT = IntVTs[i];
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);
  }

  // There are no 8-bit 3-address imul/mul instructions
  setOperationAction(ISD::SMULO, MVT::i8, Expand);
  setOperationAction(ISD::UMULO, MVT::i8, Expand);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
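    // (With no libcall name registered, the type legalizer expands these
    // i128 shifts inline instead of emitting a call.)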
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are optimizing for size.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 8;   // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasXMM())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded. If 'NonScalarIntSafe' is true, that means it's safe to
/// return a non-scalar-integer type, e.g. empty string source, constant, or
/// loaded from memory.
/// 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasXMMInt()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const {
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = (Subtarget->is64Bit()
           ? X86::GR64RegisterClass : X86::GR32RegisterClass);
    break;
  case MVT::x86mmx:
    RRC = X86::VR64RegisterClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
  case MVT::v4f64:
    RRC = X86::VR128RegisterClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned
X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
                                       MachineFunction &MF) const {
  const TargetFrameInfo *TFI = MF.getTarget().getFrameInfo();

  unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
  switch (RC->getID()) {
  default:
    return 0;
  case X86::GR32RegClassID:
    return 4 - FPDiff;
  case X86::GR64RegClassID:
    return 8 - FPDiff;
  case X86::VR128RegClassID:
    return Subtarget->is64Bit() ?
      10 : 4;
  case X86::VR64RegClassID:
    return 4;
  }
}

bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}


//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
                                         MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = OutVals[i];
    EVT ValVT = ValToCopy.getValueType();

    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget->is64Bit() && !Subtarget->hasXMM())) {
      report_fatal_error("SSE register return with SSE disabled");
    }
    // Likewise we can't return F64 values with SSE1 only. gcc does so, but
    // llvm-gcc has never done it right and no one has noticed, so this
    // should be OK for now.
    if (ValVT == MVT::f64 &&
        (Subtarget->is64Bit() && !Subtarget->hasXMMInt()))
      report_fatal_error("SSE2 register return with SSE2 disabled");

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
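    // (The FP Stackifier runs after register allocation and rewrites these
    // into real x87 stack operations.)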
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget->hasSSE2())
            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
        }
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    assert(Reg &&
           "SRetReturnReg should have been set in LowerFormalArguments().");
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX now acts like a return value.
    MRI.addLiveOut(X86::RAX);
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}

bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() != ISD::CopyToReg &&
      Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    HasRet = true;
  }

  return HasRet;
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) const {

  // Assign locations to each value returned by this call.
1363   SmallVector<CCValAssign, 16> RVLocs;
1364   bool Is64Bit = Subtarget->is64Bit();
1365   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1366                  RVLocs, *DAG.getContext());
1367   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1368
1369   // Copy all of the result registers out of their specified physreg.
1370   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1371     CCValAssign &VA = RVLocs[i];
1372     EVT CopyVT = VA.getValVT();
1373
1374     // If this is x86-64, and we disabled SSE, we can't return FP values
1375     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1376         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) {
1377       report_fatal_error("SSE register return with SSE disabled");
1378     }
1379
1380     SDValue Val;
1381
1382     // If this is a call to a function that returns an fp value on the floating
1383     // point stack, we must guarantee that the value is popped from the stack, so
1384     // a CopyFromReg is not good enough - the copy instruction may be eliminated
1385     // if the return value is not used. We use the FpGET_ST0 instructions
1386     // instead.
1387     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1388       // If we prefer to use the value in xmm registers, copy it out as f80 and
1389       // use a truncate to move it from fp stack reg to xmm reg.
1390       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1391       bool isST0 = VA.getLocReg() == X86::ST0;
1392       unsigned Opc = 0;
1393       if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32;
1394       if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64;
1395       if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80;
1396       SDValue Ops[] = { Chain, InFlag };
1397       Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag,
1398                                          Ops, 2), 1);
1399       Val = Chain.getValue(0);
1400
1401       // Round the f80 to the right size, which also moves it to the appropriate
1402       // xmm register.
1403       if (CopyVT != VA.getValVT())
1404         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1405                           // This truncation won't change the value.
1406                           DAG.getIntPtrConstant(1));
1407     } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
1408       // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
1409       if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1410         Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1411                                    MVT::v2i64, InFlag).getValue(1);
1412         Val = Chain.getValue(0);
1413         Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1414                           Val, DAG.getConstant(0, MVT::i64));
1415       } else {
1416         Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1417                                    MVT::i64, InFlag).getValue(1);
1418         Val = Chain.getValue(0);
1419       }
1420       Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val);
1421     } else {
1422       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1423                                  CopyVT, InFlag).getValue(1);
1424       Val = Chain.getValue(0);
1425     }
1426     InFlag = Chain.getValue(2);
1427     InVals.push_back(Val);
1428   }
1429
1430   return Chain;
1431 }
1432
1433
1434 //===----------------------------------------------------------------------===//
1435 //                C & StdCall & Fast Calling Convention implementation
1436 //===----------------------------------------------------------------------===//
1437 //  StdCall calling convention seems to be standard for many Windows' API
1438 //  routines and around. It differs from C calling convention just a little:
1439 //  callee should clean up the stack, not caller.
Symbols should be also 1440// decorated in some fancy way :) It doesn't support any vector arguments. 1441// For info on fast calling convention see Fast Calling Convention (tail call) 1442// implementation LowerX86_32FastCCCallTo. 1443 1444/// CallIsStructReturn - Determines whether a call uses struct return 1445/// semantics. 1446static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1447 if (Outs.empty()) 1448 return false; 1449 1450 return Outs[0].Flags.isSRet(); 1451} 1452 1453/// ArgsAreStructReturn - Determines whether a function uses struct 1454/// return semantics. 1455static bool 1456ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1457 if (Ins.empty()) 1458 return false; 1459 1460 return Ins[0].Flags.isSRet(); 1461} 1462 1463/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1464/// by "Src" to address "Dst" with size and alignment information specified by 1465/// the specific parameter attribute. The copy will be passed as a byval 1466/// function parameter. 1467static SDValue 1468CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1469 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1470 DebugLoc dl) { 1471 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1472 1473 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1474 /*isVolatile*/false, /*AlwaysInline=*/true, 1475 MachinePointerInfo(), MachinePointerInfo()); 1476} 1477 1478/// IsTailCallConvention - Return true if the calling convention is one that 1479/// supports tail call optimization. 1480static bool IsTailCallConvention(CallingConv::ID CC) { 1481 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1482} 1483 1484/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1485/// a tailcall target by changing its ABI. 1486static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1487 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1488} 1489 1490SDValue 1491X86TargetLowering::LowerMemArgument(SDValue Chain, 1492 CallingConv::ID CallConv, 1493 const SmallVectorImpl<ISD::InputArg> &Ins, 1494 DebugLoc dl, SelectionDAG &DAG, 1495 const CCValAssign &VA, 1496 MachineFrameInfo *MFI, 1497 unsigned i) const { 1498 // Create the nodes corresponding to a load from this parameter slot. 1499 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1500 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1501 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1502 EVT ValVT; 1503 1504 // If value is passed by pointer we have address passed instead of the value 1505 // itself. 1506 if (VA.getLocInfo() == CCValAssign::Indirect) 1507 ValVT = VA.getLocVT(); 1508 else 1509 ValVT = VA.getValVT(); 1510 1511 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1512 // changed with more analysis. 1513 // In case of tail call optimization mark all arguments mutable. Since they 1514 // could be overwritten by lowering of arguments in case of a tail call. 
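// A byval argument has already been placed on the stack by the caller, so the
// lowered value below is simply the frame index of its fixed stack object (the
// argument's address); any other memory argument is materialized with a load
// from its stack slot.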
1515 if (Flags.isByVal()) { 1516 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1517 VA.getLocMemOffset(), isImmutable); 1518 return DAG.getFrameIndex(FI, getPointerTy()); 1519 } else { 1520 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1521 VA.getLocMemOffset(), isImmutable); 1522 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1523 return DAG.getLoad(ValVT, dl, Chain, FIN, 1524 MachinePointerInfo::getFixedStack(FI), 1525 false, false, 0); 1526 } 1527} 1528 1529SDValue 1530X86TargetLowering::LowerFormalArguments(SDValue Chain, 1531 CallingConv::ID CallConv, 1532 bool isVarArg, 1533 const SmallVectorImpl<ISD::InputArg> &Ins, 1534 DebugLoc dl, 1535 SelectionDAG &DAG, 1536 SmallVectorImpl<SDValue> &InVals) 1537 const { 1538 MachineFunction &MF = DAG.getMachineFunction(); 1539 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1540 1541 const Function* Fn = MF.getFunction(); 1542 if (Fn->hasExternalLinkage() && 1543 Subtarget->isTargetCygMing() && 1544 Fn->getName() == "main") 1545 FuncInfo->setForceFramePointer(true); 1546 1547 MachineFrameInfo *MFI = MF.getFrameInfo(); 1548 bool Is64Bit = Subtarget->is64Bit(); 1549 bool IsWin64 = Subtarget->isTargetWin64(); 1550 1551 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1552 "Var args not supported with calling convention fastcc or ghc"); 1553 1554 // Assign locations to all of the incoming arguments. 1555 SmallVector<CCValAssign, 16> ArgLocs; 1556 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1557 ArgLocs, *DAG.getContext()); 1558 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1559 1560 unsigned LastVal = ~0U; 1561 SDValue ArgValue; 1562 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1563 CCValAssign &VA = ArgLocs[i]; 1564 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1565 // places. 1566 assert(VA.getValNo() != LastVal && 1567 "Don't support value assigned to multiple locs yet"); 1568 LastVal = VA.getValNo(); 1569 1570 if (VA.isRegLoc()) { 1571 EVT RegVT = VA.getLocVT(); 1572 TargetRegisterClass *RC = NULL; 1573 if (RegVT == MVT::i32) 1574 RC = X86::GR32RegisterClass; 1575 else if (Is64Bit && RegVT == MVT::i64) 1576 RC = X86::GR64RegisterClass; 1577 else if (RegVT == MVT::f32) 1578 RC = X86::FR32RegisterClass; 1579 else if (RegVT == MVT::f64) 1580 RC = X86::FR64RegisterClass; 1581 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1582 RC = X86::VR256RegisterClass; 1583 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1584 RC = X86::VR128RegisterClass; 1585 else if (RegVT == MVT::x86mmx) 1586 RC = X86::VR64RegisterClass; 1587 else 1588 llvm_unreachable("Unknown argument type!"); 1589 1590 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1591 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1592 1593 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1594 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1595 // right size. 1596 if (VA.getLocInfo() == CCValAssign::SExt) 1597 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1598 DAG.getValueType(VA.getValVT())); 1599 else if (VA.getLocInfo() == CCValAssign::ZExt) 1600 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1601 DAG.getValueType(VA.getValVT())); 1602 else if (VA.getLocInfo() == CCValAssign::BCvt) 1603 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1604 1605 if (VA.isExtInLoc()) { 1606 // Handle MMX values passed in XMM regs. 
1607 if (RegVT.isVector()) { 1608 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1609 ArgValue); 1610 } else 1611 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1612 } 1613 } else { 1614 assert(VA.isMemLoc()); 1615 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1616 } 1617 1618 // If value is passed via pointer - do a load. 1619 if (VA.getLocInfo() == CCValAssign::Indirect) 1620 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1621 MachinePointerInfo(), false, false, 0); 1622 1623 InVals.push_back(ArgValue); 1624 } 1625 1626 // The x86-64 ABI for returning structs by value requires that we copy 1627 // the sret argument into %rax for the return. Save the argument into 1628 // a virtual register so that we can access it from the return points. 1629 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1630 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1631 unsigned Reg = FuncInfo->getSRetReturnReg(); 1632 if (!Reg) { 1633 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1634 FuncInfo->setSRetReturnReg(Reg); 1635 } 1636 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1637 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1638 } 1639 1640 unsigned StackSize = CCInfo.getNextStackOffset(); 1641 // Align stack specially for tail calls. 1642 if (FuncIsMadeTailCallSafe(CallConv)) 1643 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1644 1645 // If the function takes variable number of arguments, make a frame index for 1646 // the start of the first vararg value... for expansion of llvm.va_start. 1647 if (isVarArg) { 1648 if (!IsWin64 && (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1649 CallConv != CallingConv::X86_ThisCall))) { 1650 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1651 } 1652 if (Is64Bit) { 1653 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1654 1655 // FIXME: We should really autogenerate these arrays 1656 static const unsigned GPR64ArgRegsWin64[] = { 1657 X86::RCX, X86::RDX, X86::R8, X86::R9 1658 }; 1659 static const unsigned GPR64ArgRegs64Bit[] = { 1660 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1661 }; 1662 static const unsigned XMMArgRegs64Bit[] = { 1663 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1664 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1665 }; 1666 const unsigned *GPR64ArgRegs; 1667 unsigned NumXMMRegs = 0; 1668 1669 if (IsWin64) { 1670 // The XMM registers which might contain var arg parameters are shadowed 1671 // in their paired GPR. So we only need to save the GPR to their home 1672 // slots. 1673 TotalNumIntRegs = 4; 1674 GPR64ArgRegs = GPR64ArgRegsWin64; 1675 } else { 1676 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1677 GPR64ArgRegs = GPR64ArgRegs64Bit; 1678 1679 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); 1680 } 1681 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1682 TotalNumIntRegs); 1683 1684 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1685 assert(!(NumXMMRegs && !Subtarget->hasXMM()) && 1686 "SSE register cannot be used when SSE is disabled!"); 1687 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1688 "SSE register cannot be used when SSE is disabled!"); 1689 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) 1690 // Kernel mode asks for SSE to be disabled, so don't push them 1691 // on the stack. 
1692 TotalNumXMMRegs = 0; 1693 1694 if (IsWin64) { 1695 const TargetFrameInfo &TFI = *getTargetMachine().getFrameInfo(); 1696 // Get to the caller-allocated home save location. Add 8 to account 1697 // for the return address. 1698 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 1699 FuncInfo->setRegSaveFrameIndex( 1700 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1701 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1702 } else { 1703 // For X86-64, if there are vararg parameters that are passed via 1704 // registers, then we must store them to their spots on the stack so they 1705 // may be loaded by deferencing the result of va_next. 1706 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1707 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1708 FuncInfo->setRegSaveFrameIndex( 1709 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1710 false)); 1711 } 1712 1713 // Store the integer parameter registers. 1714 SmallVector<SDValue, 8> MemOps; 1715 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1716 getPointerTy()); 1717 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1718 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1719 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1720 DAG.getIntPtrConstant(Offset)); 1721 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1722 X86::GR64RegisterClass); 1723 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1724 SDValue Store = 1725 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1726 MachinePointerInfo::getFixedStack( 1727 FuncInfo->getRegSaveFrameIndex(), Offset), 1728 false, false, 0); 1729 MemOps.push_back(Store); 1730 Offset += 8; 1731 } 1732 1733 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1734 // Now store the XMM (fp + vector) parameter registers. 1735 SmallVector<SDValue, 11> SaveXMMOps; 1736 SaveXMMOps.push_back(Chain); 1737 1738 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1739 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1740 SaveXMMOps.push_back(ALVal); 1741 1742 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1743 FuncInfo->getRegSaveFrameIndex())); 1744 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1745 FuncInfo->getVarArgsFPOffset())); 1746 1747 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1748 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 1749 X86::VR128RegisterClass); 1750 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1751 SaveXMMOps.push_back(Val); 1752 } 1753 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1754 MVT::Other, 1755 &SaveXMMOps[0], SaveXMMOps.size())); 1756 } 1757 1758 if (!MemOps.empty()) 1759 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1760 &MemOps[0], MemOps.size()); 1761 } 1762 } 1763 1764 // Some CCs need callee pop. 1765 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1766 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1767 } else { 1768 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1769 // If this is an sret function, the return should pop the hidden pointer. 1770 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1771 FuncInfo->setBytesToPopOnReturn(4); 1772 } 1773 1774 if (!Is64Bit) { 1775 // RegSaveFrameIndex is X86-64 only. 
1776 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1777 if (CallConv == CallingConv::X86_FastCall || 1778 CallConv == CallingConv::X86_ThisCall) 1779 // fastcc functions can't have varargs. 1780 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1781 } 1782 1783 return Chain; 1784} 1785 1786SDValue 1787X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1788 SDValue StackPtr, SDValue Arg, 1789 DebugLoc dl, SelectionDAG &DAG, 1790 const CCValAssign &VA, 1791 ISD::ArgFlagsTy Flags) const { 1792 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1793 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1794 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1795 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1796 if (Flags.isByVal()) 1797 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1798 1799 return DAG.getStore(Chain, dl, Arg, PtrOff, 1800 MachinePointerInfo::getStack(LocMemOffset), 1801 false, false, 0); 1802} 1803 1804/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1805/// optimization is performed and it is required. 1806SDValue 1807X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1808 SDValue &OutRetAddr, SDValue Chain, 1809 bool IsTailCall, bool Is64Bit, 1810 int FPDiff, DebugLoc dl) const { 1811 // Adjust the Return address stack slot. 1812 EVT VT = getPointerTy(); 1813 OutRetAddr = getReturnAddressFrameIndex(DAG); 1814 1815 // Load the "old" Return address. 1816 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 1817 false, false, 0); 1818 return SDValue(OutRetAddr.getNode(), 1); 1819} 1820 1821/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1822/// optimization is performed and it is required (FPDiff!=0). 1823static SDValue 1824EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1825 SDValue Chain, SDValue RetAddrFrIdx, 1826 bool Is64Bit, int FPDiff, DebugLoc dl) { 1827 // Store the return address to the appropriate stack slot. 1828 if (!FPDiff) return Chain; 1829 // Calculate the new stack slot for the return address. 1830 int SlotSize = Is64Bit ? 8 : 4; 1831 int NewReturnAddrFI = 1832 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1833 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1834 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1835 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1836 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 1837 false, false, 0); 1838 return Chain; 1839} 1840 1841SDValue 1842X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1843 CallingConv::ID CallConv, bool isVarArg, 1844 bool &isTailCall, 1845 const SmallVectorImpl<ISD::OutputArg> &Outs, 1846 const SmallVectorImpl<SDValue> &OutVals, 1847 const SmallVectorImpl<ISD::InputArg> &Ins, 1848 DebugLoc dl, SelectionDAG &DAG, 1849 SmallVectorImpl<SDValue> &InVals) const { 1850 MachineFunction &MF = DAG.getMachineFunction(); 1851 bool Is64Bit = Subtarget->is64Bit(); 1852 bool IsStructRet = CallIsStructReturn(Outs); 1853 bool IsSibcall = false; 1854 1855 if (isTailCall) { 1856 // Check if it's really possible to do a tail call. 1857 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1858 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1859 Outs, OutVals, Ins, DAG); 1860 1861 // Sibcalls are automatically detected tailcalls which do not require 1862 // ABI changes. 
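  // (That is, when GuaranteedTailCallOpt is not in effect, an eligible tail
  // call is lowered as a sibcall that reuses the caller's existing incoming
  // argument area rather than reshaping the stack.)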
1863     if (!GuaranteedTailCallOpt && isTailCall)
1864       IsSibcall = true;
1865
1866     if (isTailCall)
1867       ++NumTailCalls;
1868   }
1869
1870   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1871          "Var args not supported with calling convention fastcc or ghc");
1872
1873   // Analyze operands of the call, assigning locations to each operand.
1874   SmallVector<CCValAssign, 16> ArgLocs;
1875   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1876                  ArgLocs, *DAG.getContext());
1877   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
1878
1879   // Get a count of how many bytes are to be pushed on the stack.
1880   unsigned NumBytes = CCInfo.getNextStackOffset();
1881   if (IsSibcall)
1882     // This is a sibcall. The memory operands are already available in the caller's
1883     // incoming argument area (i.e. its own caller's stack).
1884     NumBytes = 0;
1885   else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
1886     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1887
1888   int FPDiff = 0;
1889   if (isTailCall && !IsSibcall) {
1890     // Lower arguments at fp - stackoffset + fpdiff.
1891     unsigned NumBytesCallerPushed =
1892       MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1893     FPDiff = NumBytesCallerPushed - NumBytes;
1894
1895     // Set the delta of movement of the returnaddr stackslot.
1896     // But only set if delta is greater than previous delta.
1897     if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1898       MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1899   }
1900
1901   if (!IsSibcall)
1902     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1903
1904   SDValue RetAddrFrIdx;
1905   // Load return address for tail calls.
1906   if (isTailCall && FPDiff)
1907     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
1908                                     Is64Bit, FPDiff, dl);
1909
1910   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1911   SmallVector<SDValue, 8> MemOpChains;
1912   SDValue StackPtr;
1913
1914   // Walk the register/memloc assignments, inserting copies/loads. In the case
1915   // of tail call optimization, arguments are handled later.
1916   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1917     CCValAssign &VA = ArgLocs[i];
1918     EVT RegVT = VA.getLocVT();
1919     SDValue Arg = OutVals[i];
1920     ISD::ArgFlagsTy Flags = Outs[i].Flags;
1921     bool isByVal = Flags.isByVal();
1922
1923     // Promote the value if needed.
1924     switch (VA.getLocInfo()) {
1925     default: llvm_unreachable("Unknown loc info!");
1926     case CCValAssign::Full: break;
1927     case CCValAssign::SExt:
1928       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1929       break;
1930     case CCValAssign::ZExt:
1931       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1932       break;
1933     case CCValAssign::AExt:
1934       if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1935         // Special case: passing MMX values in XMM registers.
1936         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
1937         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1938         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1939       } else
1940         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1941       break;
1942     case CCValAssign::BCvt:
1943       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
1944       break;
1945     case CCValAssign::Indirect: {
1946       // Store the argument.
1947 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1948 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1949 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1950 MachinePointerInfo::getFixedStack(FI), 1951 false, false, 0); 1952 Arg = SpillSlot; 1953 break; 1954 } 1955 } 1956 1957 if (VA.isRegLoc()) { 1958 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1959 if (isVarArg && Subtarget->isTargetWin64()) { 1960 // Win64 ABI requires argument XMM reg to be copied to the corresponding 1961 // shadow reg if callee is a varargs function. 1962 unsigned ShadowReg = 0; 1963 switch (VA.getLocReg()) { 1964 case X86::XMM0: ShadowReg = X86::RCX; break; 1965 case X86::XMM1: ShadowReg = X86::RDX; break; 1966 case X86::XMM2: ShadowReg = X86::R8; break; 1967 case X86::XMM3: ShadowReg = X86::R9; break; 1968 } 1969 if (ShadowReg) 1970 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 1971 } 1972 } else if (!IsSibcall && (!isTailCall || isByVal)) { 1973 assert(VA.isMemLoc()); 1974 if (StackPtr.getNode() == 0) 1975 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1976 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1977 dl, DAG, VA, Flags)); 1978 } 1979 } 1980 1981 if (!MemOpChains.empty()) 1982 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1983 &MemOpChains[0], MemOpChains.size()); 1984 1985 // Build a sequence of copy-to-reg nodes chained together with token chain 1986 // and flag operands which copy the outgoing args into registers. 1987 SDValue InFlag; 1988 // Tail call byval lowering might overwrite argument registers so in case of 1989 // tail call optimization the copies to registers are lowered later. 1990 if (!isTailCall) 1991 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1992 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1993 RegsToPass[i].second, InFlag); 1994 InFlag = Chain.getValue(1); 1995 } 1996 1997 if (Subtarget->isPICStyleGOT()) { 1998 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1999 // GOT pointer. 2000 if (!isTailCall) { 2001 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2002 DAG.getNode(X86ISD::GlobalBaseReg, 2003 DebugLoc(), getPointerTy()), 2004 InFlag); 2005 InFlag = Chain.getValue(1); 2006 } else { 2007 // If we are tail calling and generating PIC/GOT style code load the 2008 // address of the callee into ECX. The value in ecx is used as target of 2009 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2010 // for tail calls on PIC/GOT architectures. Normally we would just put the 2011 // address of GOT into ebx and then call target@PLT. But for tail calls 2012 // ebx would be restored (since ebx is callee saved) before jumping to the 2013 // target@PLT. 2014 2015 // Note: The actual moving to ECX is done further down. 2016 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2017 if (G && !G->getGlobal()->hasHiddenVisibility() && 2018 !G->getGlobal()->hasProtectedVisibility()) 2019 Callee = LowerGlobalAddress(Callee, DAG); 2020 else if (isa<ExternalSymbolSDNode>(Callee)) 2021 Callee = LowerExternalSymbol(Callee, DAG); 2022 } 2023 } 2024 2025 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) { 2026 // From AMD64 ABI document: 2027 // For calls that may call functions that use varargs or stdargs 2028 // (prototype-less calls or calls to functions containing ellipsis (...) in 2029 // the declaration) %al is used as hidden argument to specify the number 2030 // of SSE registers used. 
The contents of %al do not need to match exactly
2031     // the number of registers, but must be an upper bound on the number of SSE
2032     // registers used, and be in the range 0 - 8 inclusive.
2033
2034     // Count the number of XMM registers allocated.
2035     static const unsigned XMMArgRegs[] = {
2036       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2037       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2038     };
2039     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2040     assert((Subtarget->hasXMM() || !NumXMMRegs)
2041            && "SSE registers cannot be used when SSE is disabled");
2042
2043     Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
2044                              DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
2045     InFlag = Chain.getValue(1);
2046   }
2047
2048
2049   // For tail calls lower the arguments to the 'real' stack slot.
2050   if (isTailCall) {
2051     // Force all the incoming stack arguments to be loaded from the stack
2052     // before any new outgoing arguments are stored to the stack, because the
2053     // outgoing stack slots may alias the incoming argument stack slots, and
2054     // the alias isn't otherwise explicit. This is slightly more conservative
2055     // than necessary, because it means that each store effectively depends
2056     // on every argument instead of just those arguments it would clobber.
2057     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2058
2059     SmallVector<SDValue, 8> MemOpChains2;
2060     SDValue FIN;
2061     int FI = 0;
2062     // Do not flag preceding copytoreg stuff together with the following stuff.
2063     InFlag = SDValue();
2064     if (GuaranteedTailCallOpt) {
2065       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2066         CCValAssign &VA = ArgLocs[i];
2067         if (VA.isRegLoc())
2068           continue;
2069         assert(VA.isMemLoc());
2070         SDValue Arg = OutVals[i];
2071         ISD::ArgFlagsTy Flags = Outs[i].Flags;
2072         // Create frame index.
2073         int32_t Offset = VA.getLocMemOffset()+FPDiff;
2074         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2075         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2076         FIN = DAG.getFrameIndex(FI, getPointerTy());
2077
2078         if (Flags.isByVal()) {
2079           // Copy relative to framepointer.
2080           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2081           if (StackPtr.getNode() == 0)
2082             StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2083                                           getPointerTy());
2084           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2085
2086           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2087                                                            ArgChain,
2088                                                            Flags, DAG, dl));
2089         } else {
2090           // Store relative to framepointer.
2091           MemOpChains2.push_back(
2092             DAG.getStore(ArgChain, dl, Arg, FIN,
2093                          MachinePointerInfo::getFixedStack(FI),
2094                          false, false, 0));
2095         }
2096       }
2097     }
2098
2099     if (!MemOpChains2.empty())
2100       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2101                           &MemOpChains2[0], MemOpChains2.size());
2102
2103     // Copy arguments to their registers.
2104     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2105       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2106                                RegsToPass[i].second, InFlag);
2107       InFlag = Chain.getValue(1);
2108     }
2109     InFlag = SDValue();
2110
2111     // Store the return address to the appropriate stack slot.
2112     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2113                                      FPDiff, dl);
2114   }
2115
2116   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2117     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2118     // In the 64-bit large code model, we have to make all calls
2119     // through a register, since the call instruction's 32-bit
2120     // pc-relative offset may not be large enough to hold the whole
2121     // address.
2122   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2123     // If the callee is a GlobalAddress node (quite common, every direct call
2124     // is), turn it into a TargetGlobalAddress node so that legalize doesn't hack
2125     // it.
2126
2127     // We should use extra load for direct calls to dllimported functions in
2128     // non-JIT mode.
2129     const GlobalValue *GV = G->getGlobal();
2130     if (!GV->hasDLLImportLinkage()) {
2131       unsigned char OpFlags = 0;
2132
2133       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2134       // external symbols must go through the PLT in PIC mode. If the symbol
2135       // has hidden or protected visibility, or if it is static or local, then
2136       // we don't need to use the PLT - we can directly call it.
2137       if (Subtarget->isTargetELF() &&
2138           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2139           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2140         OpFlags = X86II::MO_PLT;
2141       } else if (Subtarget->isPICStyleStubAny() &&
2142                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
2143                  Subtarget->getDarwinVers() < 9) {
2144         // PC-relative references to external symbols should go through $stub,
2145         // unless we're building with the leopard linker or later, which
2146         // automatically synthesizes these stubs.
2147         OpFlags = X86II::MO_DARWIN_STUB;
2148       }
2149
2150       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2151                                           G->getOffset(), OpFlags);
2152     }
2153   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2154     unsigned char OpFlags = 0;
2155
2156     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2157     // external symbols should go through the PLT.
2158     if (Subtarget->isTargetELF() &&
2159         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2160       OpFlags = X86II::MO_PLT;
2161     } else if (Subtarget->isPICStyleStubAny() &&
2162                Subtarget->getDarwinVers() < 9) {
2163       // PC-relative references to external symbols should go through $stub,
2164       // unless we're building with the leopard linker or later, which
2165       // automatically synthesizes these stubs.
2166       OpFlags = X86II::MO_DARWIN_STUB;
2167     }
2168
2169     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2170                                          OpFlags);
2171   }
2172
2173   // Returns a chain & a flag for retval copy to use.
2174   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
2175   SmallVector<SDValue, 8> Ops;
2176
2177   if (!IsSibcall && isTailCall) {
2178     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2179                                DAG.getIntPtrConstant(0, true), InFlag);
2180     InFlag = Chain.getValue(1);
2181   }
2182
2183   Ops.push_back(Chain);
2184   Ops.push_back(Callee);
2185
2186   if (isTailCall)
2187     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2188
2189   // Add argument registers to the end of the list so that they are known live
2190   // into the call.
2191   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2192     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2193                                   RegsToPass[i].second.getValueType()));
2194
2195   // Add an implicit use of the GOT pointer in EBX.
2196   if (!isTailCall && Subtarget->isPICStyleGOT())
2197     Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2198
2199   // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
2200   if (Is64Bit && isVarArg && !Subtarget->isTargetWin64())
2201     Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2202
2203   if (InFlag.getNode())
2204     Ops.push_back(InFlag);
2205
2206   if (isTailCall) {
2207     // We used to do:
2208     //// If this is the first return lowered for this function, add the regs
2209     //// to the liveout set for the function.
2210     // This isn't right, although it's probably harmless on x86; liveouts
2211     // should be computed from returns not tail calls. Consider a void
2212     // function making a tail call to a function returning int.
2213     return DAG.getNode(X86ISD::TC_RETURN, dl,
2214                        NodeTys, &Ops[0], Ops.size());
2215   }
2216
2217   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2218   InFlag = Chain.getValue(1);
2219
2220   // Create the CALLSEQ_END node.
2221   unsigned NumBytesForCalleeToPush;
2222   if (Subtarget->IsCalleePop(isVarArg, CallConv))
2223     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2224   else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2225     // If this is a call to a struct-return function, the callee
2226     // pops the hidden struct pointer, so we have to push it back.
2227     // This is common for Darwin/X86, Linux & Mingw32 targets.
2228     NumBytesForCalleeToPush = 4;
2229   else
2230     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2231
2232   // Returns a flag for retval copy to use.
2233   if (!IsSibcall) {
2234     Chain = DAG.getCALLSEQ_END(Chain,
2235                                DAG.getIntPtrConstant(NumBytes, true),
2236                                DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2237                                                      true),
2238                                InFlag);
2239     InFlag = Chain.getValue(1);
2240   }
2241
2242   // Handle result values, copying them out of physregs into vregs that we
2243   // return.
2244   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2245                          Ins, dl, DAG, InVals);
2246 }
2247
2248
2249 //===----------------------------------------------------------------------===//
2250 //                Fast Calling Convention (tail call) implementation
2251 //===----------------------------------------------------------------------===//
2252
2253 //  Like the StdCall (callee cleans up arguments) convention, except that ECX is
2254 //  reserved for storing the tail-called function's address. Only 2 registers are
2255 //  free for argument passing (inreg). Tail call optimization is performed
2256 //  provided:
2257 //  * tailcallopt is enabled
2258 //  * caller/callee are fastcc
2259 //  On X86_64 architecture with GOT-style position independent code only local
2260 //  (within module) calls are supported at the moment.
2261 //  To keep the stack aligned according to the platform ABI, the function
2262 //  GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
2263 //  of the stack alignment. (Dynamic linkers need this - Darwin's dyld, for example.)
2264 //  If a tail-called callee has more arguments than the caller, the
2265 //  caller needs to make sure that there is room to move the RETADDR to. This is
2266 //  achieved by reserving an area the size of the argument delta right after the
2267 //  original RETADDR, but before the saved frame pointer or the spilled registers,
2268 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
2269 //  stack layout:
2270 //    arg1
2271 //    arg2
2272 //    RETADDR
2273 //    [ new RETADDR
2274 //      move area ]
2275 //    (possible EBP)
2276 //    ESI
2277 //    EDI
2278 //    local1 ..
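//  A worked example of the alignment helper defined below: with a 16-byte stack
//  alignment and 4-byte stack slots, GetAlignedArgumentStackSize pads a 20-byte
//  argument area to 28 bytes (16n + 12), so that pushing the 4-byte RETADDR
//  brings the stack back to a 16-byte boundary.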
2279 2280/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 2281/// for a 16 byte align requirement. 2282unsigned 2283X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2284 SelectionDAG& DAG) const { 2285 MachineFunction &MF = DAG.getMachineFunction(); 2286 const TargetMachine &TM = MF.getTarget(); 2287 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 2288 unsigned StackAlignment = TFI.getStackAlignment(); 2289 uint64_t AlignMask = StackAlignment - 1; 2290 int64_t Offset = StackSize; 2291 uint64_t SlotSize = TD->getPointerSize(); 2292 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2293 // Number smaller than 12 so just add the difference. 2294 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2295 } else { 2296 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2297 Offset = ((~AlignMask) & Offset) + StackAlignment + 2298 (StackAlignment-SlotSize); 2299 } 2300 return Offset; 2301} 2302 2303/// MatchingStackOffset - Return true if the given stack call argument is 2304/// already available in the same position (relatively) of the caller's 2305/// incoming argument stack. 2306static 2307bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2308 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2309 const X86InstrInfo *TII) { 2310 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2311 int FI = INT_MAX; 2312 if (Arg.getOpcode() == ISD::CopyFromReg) { 2313 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2314 if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) 2315 return false; 2316 MachineInstr *Def = MRI->getVRegDef(VR); 2317 if (!Def) 2318 return false; 2319 if (!Flags.isByVal()) { 2320 if (!TII->isLoadFromStackSlot(Def, FI)) 2321 return false; 2322 } else { 2323 unsigned Opcode = Def->getOpcode(); 2324 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2325 Def->getOperand(1).isFI()) { 2326 FI = Def->getOperand(1).getIndex(); 2327 Bytes = Flags.getByValSize(); 2328 } else 2329 return false; 2330 } 2331 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2332 if (Flags.isByVal()) 2333 // ByVal argument is passed in as a pointer but it's now being 2334 // dereferenced. e.g. 2335 // define @foo(%struct.X* %A) { 2336 // tail call @bar(%struct.X* byval %A) 2337 // } 2338 return false; 2339 SDValue Ptr = Ld->getBasePtr(); 2340 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2341 if (!FINode) 2342 return false; 2343 FI = FINode->getIndex(); 2344 } else 2345 return false; 2346 2347 assert(FI != INT_MAX); 2348 if (!MFI->isFixedObjectIndex(FI)) 2349 return false; 2350 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2351} 2352 2353/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2354/// for tail call optimization. Targets which want to do tail call 2355/// optimization should implement this function. 2356bool 2357X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2358 CallingConv::ID CalleeCC, 2359 bool isVarArg, 2360 bool isCalleeStructRet, 2361 bool isCallerStructRet, 2362 const SmallVectorImpl<ISD::OutputArg> &Outs, 2363 const SmallVectorImpl<SDValue> &OutVals, 2364 const SmallVectorImpl<ISD::InputArg> &Ins, 2365 SelectionDAG& DAG) const { 2366 if (!IsTailCallConvention(CalleeCC) && 2367 CalleeCC != CallingConv::C) 2368 return false; 2369 2370 // If -tailcallopt is specified, make fastcc functions tail-callable. 
2371 const MachineFunction &MF = DAG.getMachineFunction(); 2372 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2373 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2374 bool CCMatch = CallerCC == CalleeCC; 2375 2376 if (GuaranteedTailCallOpt) { 2377 if (IsTailCallConvention(CalleeCC) && CCMatch) 2378 return true; 2379 return false; 2380 } 2381 2382 // Look for obvious safe cases to perform tail call optimization that do not 2383 // require ABI changes. This is what gcc calls sibcall. 2384 2385 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2386 // emit a special epilogue. 2387 if (RegInfo->needsStackRealignment(MF)) 2388 return false; 2389 2390 // Do not sibcall optimize vararg calls unless the call site is not passing 2391 // any arguments. 2392 if (isVarArg && !Outs.empty()) 2393 return false; 2394 2395 // Also avoid sibcall optimization if either caller or callee uses struct 2396 // return semantics. 2397 if (isCalleeStructRet || isCallerStructRet) 2398 return false; 2399 2400 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2401 // Therefore if it's not used by the call it is not safe to optimize this into 2402 // a sibcall. 2403 bool Unused = false; 2404 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2405 if (!Ins[i].Used) { 2406 Unused = true; 2407 break; 2408 } 2409 } 2410 if (Unused) { 2411 SmallVector<CCValAssign, 16> RVLocs; 2412 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2413 RVLocs, *DAG.getContext()); 2414 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2415 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2416 CCValAssign &VA = RVLocs[i]; 2417 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2418 return false; 2419 } 2420 } 2421 2422 // If the calling conventions do not match, then we'd better make sure the 2423 // results are returned in the same way as what the caller expects. 2424 if (!CCMatch) { 2425 SmallVector<CCValAssign, 16> RVLocs1; 2426 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2427 RVLocs1, *DAG.getContext()); 2428 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2429 2430 SmallVector<CCValAssign, 16> RVLocs2; 2431 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2432 RVLocs2, *DAG.getContext()); 2433 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2434 2435 if (RVLocs1.size() != RVLocs2.size()) 2436 return false; 2437 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2438 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2439 return false; 2440 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2441 return false; 2442 if (RVLocs1[i].isRegLoc()) { 2443 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2444 return false; 2445 } else { 2446 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2447 return false; 2448 } 2449 } 2450 } 2451 2452 // If the callee takes no arguments then go on to check the results of the 2453 // call. 2454 if (!Outs.empty()) { 2455 // Check if stack adjustment is needed. For now, do not do this if any 2456 // argument is passed on the stack. 2457 SmallVector<CCValAssign, 16> ArgLocs; 2458 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2459 ArgLocs, *DAG.getContext()); 2460 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2461 if (CCInfo.getNextStackOffset()) { 2462 MachineFunction &MF = DAG.getMachineFunction(); 2463 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2464 return false; 2465 if (Subtarget->isTargetWin64()) 2466 // Win64 ABI has additional complications. 
2467 return false; 2468 2469 // Check if the arguments are already laid out in the right way as 2470 // the caller's fixed stack objects. 2471 MachineFrameInfo *MFI = MF.getFrameInfo(); 2472 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2473 const X86InstrInfo *TII = 2474 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2475 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2476 CCValAssign &VA = ArgLocs[i]; 2477 SDValue Arg = OutVals[i]; 2478 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2479 if (VA.getLocInfo() == CCValAssign::Indirect) 2480 return false; 2481 if (!VA.isRegLoc()) { 2482 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2483 MFI, MRI, TII)) 2484 return false; 2485 } 2486 } 2487 } 2488 2489 // If the tailcall address may be in a register, then make sure it's 2490 // possible to register allocate for it. In 32-bit, the call address can 2491 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2492 // callee-saved registers are restored. These happen to be the same 2493 // registers used to pass 'inreg' arguments so watch out for those. 2494 if (!Subtarget->is64Bit() && 2495 !isa<GlobalAddressSDNode>(Callee) && 2496 !isa<ExternalSymbolSDNode>(Callee)) { 2497 unsigned NumInRegs = 0; 2498 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2499 CCValAssign &VA = ArgLocs[i]; 2500 if (!VA.isRegLoc()) 2501 continue; 2502 unsigned Reg = VA.getLocReg(); 2503 switch (Reg) { 2504 default: break; 2505 case X86::EAX: case X86::EDX: case X86::ECX: 2506 if (++NumInRegs == 3) 2507 return false; 2508 break; 2509 } 2510 } 2511 } 2512 } 2513 2514 // An stdcall caller is expected to clean up its arguments; the callee 2515 // isn't going to do that. 2516 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2517 return false; 2518 2519 return true; 2520} 2521 2522FastISel * 2523X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2524 return X86::createFastISel(funcInfo); 2525} 2526 2527 2528//===----------------------------------------------------------------------===// 2529// Other Lowering Hooks 2530//===----------------------------------------------------------------------===// 2531 2532static bool MayFoldLoad(SDValue Op) { 2533 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2534} 2535 2536static bool MayFoldIntoStore(SDValue Op) { 2537 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2538} 2539 2540static bool isTargetShuffle(unsigned Opcode) { 2541 switch(Opcode) { 2542 default: return false; 2543 case X86ISD::PSHUFD: 2544 case X86ISD::PSHUFHW: 2545 case X86ISD::PSHUFLW: 2546 case X86ISD::SHUFPD: 2547 case X86ISD::PALIGN: 2548 case X86ISD::SHUFPS: 2549 case X86ISD::MOVLHPS: 2550 case X86ISD::MOVLHPD: 2551 case X86ISD::MOVHLPS: 2552 case X86ISD::MOVLPS: 2553 case X86ISD::MOVLPD: 2554 case X86ISD::MOVSHDUP: 2555 case X86ISD::MOVSLDUP: 2556 case X86ISD::MOVDDUP: 2557 case X86ISD::MOVSS: 2558 case X86ISD::MOVSD: 2559 case X86ISD::UNPCKLPS: 2560 case X86ISD::UNPCKLPD: 2561 case X86ISD::PUNPCKLWD: 2562 case X86ISD::PUNPCKLBW: 2563 case X86ISD::PUNPCKLDQ: 2564 case X86ISD::PUNPCKLQDQ: 2565 case X86ISD::UNPCKHPS: 2566 case X86ISD::UNPCKHPD: 2567 case X86ISD::PUNPCKHWD: 2568 case X86ISD::PUNPCKHBW: 2569 case X86ISD::PUNPCKHDQ: 2570 case X86ISD::PUNPCKHQDQ: 2571 return true; 2572 } 2573 return false; 2574} 2575 2576static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2577 SDValue V1, SelectionDAG &DAG) { 2578 switch(Opc) { 2579 default: llvm_unreachable("Unknown x86 shuffle 
node"); 2580 case X86ISD::MOVSHDUP: 2581 case X86ISD::MOVSLDUP: 2582 case X86ISD::MOVDDUP: 2583 return DAG.getNode(Opc, dl, VT, V1); 2584 } 2585 2586 return SDValue(); 2587} 2588 2589static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2590 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2591 switch(Opc) { 2592 default: llvm_unreachable("Unknown x86 shuffle node"); 2593 case X86ISD::PSHUFD: 2594 case X86ISD::PSHUFHW: 2595 case X86ISD::PSHUFLW: 2596 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2597 } 2598 2599 return SDValue(); 2600} 2601 2602static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2603 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2604 switch(Opc) { 2605 default: llvm_unreachable("Unknown x86 shuffle node"); 2606 case X86ISD::PALIGN: 2607 case X86ISD::SHUFPD: 2608 case X86ISD::SHUFPS: 2609 return DAG.getNode(Opc, dl, VT, V1, V2, 2610 DAG.getConstant(TargetMask, MVT::i8)); 2611 } 2612 return SDValue(); 2613} 2614 2615static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2616 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2617 switch(Opc) { 2618 default: llvm_unreachable("Unknown x86 shuffle node"); 2619 case X86ISD::MOVLHPS: 2620 case X86ISD::MOVLHPD: 2621 case X86ISD::MOVHLPS: 2622 case X86ISD::MOVLPS: 2623 case X86ISD::MOVLPD: 2624 case X86ISD::MOVSS: 2625 case X86ISD::MOVSD: 2626 case X86ISD::UNPCKLPS: 2627 case X86ISD::UNPCKLPD: 2628 case X86ISD::PUNPCKLWD: 2629 case X86ISD::PUNPCKLBW: 2630 case X86ISD::PUNPCKLDQ: 2631 case X86ISD::PUNPCKLQDQ: 2632 case X86ISD::UNPCKHPS: 2633 case X86ISD::UNPCKHPD: 2634 case X86ISD::PUNPCKHWD: 2635 case X86ISD::PUNPCKHBW: 2636 case X86ISD::PUNPCKHDQ: 2637 case X86ISD::PUNPCKHQDQ: 2638 return DAG.getNode(Opc, dl, VT, V1, V2); 2639 } 2640 return SDValue(); 2641} 2642 2643SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2644 MachineFunction &MF = DAG.getMachineFunction(); 2645 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2646 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2647 2648 if (ReturnAddrIndex == 0) { 2649 // Set up a frame object for the return address. 2650 uint64_t SlotSize = TD->getPointerSize(); 2651 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2652 false); 2653 FuncInfo->setRAIndex(ReturnAddrIndex); 2654 } 2655 2656 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2657} 2658 2659 2660bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2661 bool hasSymbolicDisplacement) { 2662 // Offset should fit into 32 bit immediate field. 2663 if (!isInt<32>(Offset)) 2664 return false; 2665 2666 // If we don't have a symbolic displacement - we don't have any extra 2667 // restrictions. 2668 if (!hasSymbolicDisplacement) 2669 return true; 2670 2671 // FIXME: Some tweaks might be needed for medium code model. 2672 if (M != CodeModel::Small && M != CodeModel::Kernel) 2673 return false; 2674 2675 // For small code model we assume that latest object is 16MB before end of 31 2676 // bits boundary. We may also accept pretty large negative constants knowing 2677 // that all objects are in the positive half of address space. 2678 if (M == CodeModel::Small && Offset < 16*1024*1024) 2679 return true; 2680 2681 // For kernel code model we know that all object resist in the negative half 2682 // of 32bits address space. 
We may not accept negative offsets, since they may 2683 // be just off and we may accept pretty large positive ones. 2684 if (M == CodeModel::Kernel && Offset > 0) 2685 return true; 2686 2687 return false; 2688} 2689 2690/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2691/// specific condition code, returning the condition code and the LHS/RHS of the 2692/// comparison to make. 2693static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2694 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2695 if (!isFP) { 2696 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2697 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2698 // X > -1 -> X == 0, jump !sign. 2699 RHS = DAG.getConstant(0, RHS.getValueType()); 2700 return X86::COND_NS; 2701 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2702 // X < 0 -> X == 0, jump on sign. 2703 return X86::COND_S; 2704 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2705 // X < 1 -> X <= 0 2706 RHS = DAG.getConstant(0, RHS.getValueType()); 2707 return X86::COND_LE; 2708 } 2709 } 2710 2711 switch (SetCCOpcode) { 2712 default: llvm_unreachable("Invalid integer condition!"); 2713 case ISD::SETEQ: return X86::COND_E; 2714 case ISD::SETGT: return X86::COND_G; 2715 case ISD::SETGE: return X86::COND_GE; 2716 case ISD::SETLT: return X86::COND_L; 2717 case ISD::SETLE: return X86::COND_LE; 2718 case ISD::SETNE: return X86::COND_NE; 2719 case ISD::SETULT: return X86::COND_B; 2720 case ISD::SETUGT: return X86::COND_A; 2721 case ISD::SETULE: return X86::COND_BE; 2722 case ISD::SETUGE: return X86::COND_AE; 2723 } 2724 } 2725 2726 // First determine if it is required or is profitable to flip the operands. 2727 2728 // If LHS is a foldable load, but RHS is not, flip the condition. 2729 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2730 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2731 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2732 std::swap(LHS, RHS); 2733 } 2734 2735 switch (SetCCOpcode) { 2736 default: break; 2737 case ISD::SETOLT: 2738 case ISD::SETOLE: 2739 case ISD::SETUGT: 2740 case ISD::SETUGE: 2741 std::swap(LHS, RHS); 2742 break; 2743 } 2744 2745 // On a floating point condition, the flags are set as follows: 2746 // ZF PF CF op 2747 // 0 | 0 | 0 | X > Y 2748 // 0 | 0 | 1 | X < Y 2749 // 1 | 0 | 0 | X == Y 2750 // 1 | 1 | 1 | unordered 2751 switch (SetCCOpcode) { 2752 default: llvm_unreachable("Condcode should be pre-legalized away"); 2753 case ISD::SETUEQ: 2754 case ISD::SETEQ: return X86::COND_E; 2755 case ISD::SETOLT: // flipped 2756 case ISD::SETOGT: 2757 case ISD::SETGT: return X86::COND_A; 2758 case ISD::SETOLE: // flipped 2759 case ISD::SETOGE: 2760 case ISD::SETGE: return X86::COND_AE; 2761 case ISD::SETUGT: // flipped 2762 case ISD::SETULT: 2763 case ISD::SETLT: return X86::COND_B; 2764 case ISD::SETUGE: // flipped 2765 case ISD::SETULE: 2766 case ISD::SETLE: return X86::COND_BE; 2767 case ISD::SETONE: 2768 case ISD::SETNE: return X86::COND_NE; 2769 case ISD::SETUO: return X86::COND_P; 2770 case ISD::SETO: return X86::COND_NP; 2771 case ISD::SETOEQ: 2772 case ISD::SETUNE: return X86::COND_INVALID; 2773 } 2774} 2775 2776/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2777/// code. Current x86 isa includes the following FP cmov instructions: 2778/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 
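/// Note: only conditions that test CF, ZF and PF (the flags produced by a
/// fucomi/fcomi-style compare) have fcmov forms, which is why the signed
/// condition codes such as COND_G or COND_L are rejected below.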
2779static bool hasFPCMov(unsigned X86CC) { 2780 switch (X86CC) { 2781 default: 2782 return false; 2783 case X86::COND_B: 2784 case X86::COND_BE: 2785 case X86::COND_E: 2786 case X86::COND_P: 2787 case X86::COND_A: 2788 case X86::COND_AE: 2789 case X86::COND_NE: 2790 case X86::COND_NP: 2791 return true; 2792 } 2793} 2794 2795/// isFPImmLegal - Returns true if the target can instruction select the 2796/// specified FP immediate natively. If false, the legalizer will 2797/// materialize the FP immediate as a load from a constant pool. 2798bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 2799 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 2800 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 2801 return true; 2802 } 2803 return false; 2804} 2805 2806/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2807/// the specified range (L, H]. 2808static bool isUndefOrInRange(int Val, int Low, int Hi) { 2809 return (Val < 0) || (Val >= Low && Val < Hi); 2810} 2811 2812/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2813/// specified value. 2814static bool isUndefOrEqual(int Val, int CmpVal) { 2815 if (Val < 0 || Val == CmpVal) 2816 return true; 2817 return false; 2818} 2819 2820/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2821/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2822/// the second operand. 2823static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2824 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 2825 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2826 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2827 return (Mask[0] < 2 && Mask[1] < 2); 2828 return false; 2829} 2830 2831bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2832 SmallVector<int, 8> M; 2833 N->getMask(M); 2834 return ::isPSHUFDMask(M, N->getValueType(0)); 2835} 2836 2837/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2838/// is suitable for input to PSHUFHW. 2839static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2840 if (VT != MVT::v8i16) 2841 return false; 2842 2843 // Lower quadword copied in order or undef. 2844 for (int i = 0; i != 4; ++i) 2845 if (Mask[i] >= 0 && Mask[i] != i) 2846 return false; 2847 2848 // Upper quadword shuffled. 2849 for (int i = 4; i != 8; ++i) 2850 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2851 return false; 2852 2853 return true; 2854} 2855 2856bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2857 SmallVector<int, 8> M; 2858 N->getMask(M); 2859 return ::isPSHUFHWMask(M, N->getValueType(0)); 2860} 2861 2862/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2863/// is suitable for input to PSHUFLW. 2864static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2865 if (VT != MVT::v8i16) 2866 return false; 2867 2868 // Upper quadword copied in order. 2869 for (int i = 4; i != 8; ++i) 2870 if (Mask[i] >= 0 && Mask[i] != i) 2871 return false; 2872 2873 // Lower quadword shuffled. 2874 for (int i = 0; i != 4; ++i) 2875 if (Mask[i] >= 4) 2876 return false; 2877 2878 return true; 2879} 2880 2881bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2882 SmallVector<int, 8> M; 2883 N->getMask(M); 2884 return ::isPSHUFLWMask(M, N->getValueType(0)); 2885} 2886 2887/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2888/// is suitable for input to PALIGNR. 
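/// For example, for v8i16 a mask of consecutive elements drawn across the
/// concatenated inputs, such as <1,2,3,4,5,6,7,8>, is accepted, as is the
/// single-input rotation <1,2,3,4,5,6,7,0>.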
2889static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2890 bool hasSSSE3) { 2891 int i, e = VT.getVectorNumElements(); 2892 2893 // Do not handle v2i64 / v2f64 shuffles with palignr. 2894 if (e < 4 || !hasSSSE3) 2895 return false; 2896 2897 for (i = 0; i != e; ++i) 2898 if (Mask[i] >= 0) 2899 break; 2900 2901 // All undef, not a palignr. 2902 if (i == e) 2903 return false; 2904 2905 // Determine if it's ok to perform a palignr with only the LHS, since we 2906 // don't have access to the actual shuffle elements to see if RHS is undef. 2907 bool Unary = Mask[i] < (int)e; 2908 bool NeedsUnary = false; 2909 2910 int s = Mask[i] - i; 2911 2912 // Check the rest of the elements to see if they are consecutive. 2913 for (++i; i != e; ++i) { 2914 int m = Mask[i]; 2915 if (m < 0) 2916 continue; 2917 2918 Unary = Unary && (m < (int)e); 2919 NeedsUnary = NeedsUnary || (m < s); 2920 2921 if (NeedsUnary && !Unary) 2922 return false; 2923 if (Unary && m != ((s+i) & (e-1))) 2924 return false; 2925 if (!Unary && m != (s+i)) 2926 return false; 2927 } 2928 return true; 2929} 2930 2931bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2932 SmallVector<int, 8> M; 2933 N->getMask(M); 2934 return ::isPALIGNRMask(M, N->getValueType(0), true); 2935} 2936 2937/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2938/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2939static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2940 int NumElems = VT.getVectorNumElements(); 2941 if (NumElems != 2 && NumElems != 4) 2942 return false; 2943 2944 int Half = NumElems / 2; 2945 for (int i = 0; i < Half; ++i) 2946 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2947 return false; 2948 for (int i = Half; i < NumElems; ++i) 2949 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2950 return false; 2951 2952 return true; 2953} 2954 2955bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2956 SmallVector<int, 8> M; 2957 N->getMask(M); 2958 return ::isSHUFPMask(M, N->getValueType(0)); 2959} 2960 2961/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2962/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2963/// half elements to come from vector 1 (which would equal the dest.) and 2964/// the upper half to come from vector 2. 2965static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2966 int NumElems = VT.getVectorNumElements(); 2967 2968 if (NumElems != 2 && NumElems != 4) 2969 return false; 2970 2971 int Half = NumElems / 2; 2972 for (int i = 0; i < Half; ++i) 2973 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2974 return false; 2975 for (int i = Half; i < NumElems; ++i) 2976 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2977 return false; 2978 return true; 2979} 2980 2981static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2982 SmallVector<int, 8> M; 2983 N->getMask(M); 2984 return isCommutedSHUFPMask(M, N->getValueType(0)); 2985} 2986 2987/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2988/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 
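///
/// MOVHLPS copies the high two floats of the second source into the low half
/// of the result and leaves the high half of the first source in place, so
/// for v4f32 the accepted mask is <6, 7, 2, 3>, with undef allowed in any
/// position.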
2989bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2990 if (N->getValueType(0).getVectorNumElements() != 4) 2991 return false; 2992 2993 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2994 return isUndefOrEqual(N->getMaskElt(0), 6) && 2995 isUndefOrEqual(N->getMaskElt(1), 7) && 2996 isUndefOrEqual(N->getMaskElt(2), 2) && 2997 isUndefOrEqual(N->getMaskElt(3), 3); 2998} 2999 3000/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3001/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3002/// <2, 3, 2, 3> 3003bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 3004 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3005 3006 if (NumElems != 4) 3007 return false; 3008 3009 return isUndefOrEqual(N->getMaskElt(0), 2) && 3010 isUndefOrEqual(N->getMaskElt(1), 3) && 3011 isUndefOrEqual(N->getMaskElt(2), 2) && 3012 isUndefOrEqual(N->getMaskElt(3), 3); 3013} 3014 3015/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3016/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3017bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3018 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3019 3020 if (NumElems != 2 && NumElems != 4) 3021 return false; 3022 3023 for (unsigned i = 0; i < NumElems/2; ++i) 3024 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3025 return false; 3026 3027 for (unsigned i = NumElems/2; i < NumElems; ++i) 3028 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3029 return false; 3030 3031 return true; 3032} 3033 3034/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3035/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3036bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3037 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3038 3039 if (NumElems != 2 && NumElems != 4) 3040 return false; 3041 3042 for (unsigned i = 0; i < NumElems/2; ++i) 3043 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3044 return false; 3045 3046 for (unsigned i = 0; i < NumElems/2; ++i) 3047 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3048 return false; 3049 3050 return true; 3051} 3052 3053/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3054/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3055static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3056 bool V2IsSplat = false) { 3057 int NumElts = VT.getVectorNumElements(); 3058 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3059 return false; 3060 3061 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3062 int BitI = Mask[i]; 3063 int BitI1 = Mask[i+1]; 3064 if (!isUndefOrEqual(BitI, j)) 3065 return false; 3066 if (V2IsSplat) { 3067 if (!isUndefOrEqual(BitI1, NumElts)) 3068 return false; 3069 } else { 3070 if (!isUndefOrEqual(BitI1, j + NumElts)) 3071 return false; 3072 } 3073 } 3074 return true; 3075} 3076 3077bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3078 SmallVector<int, 8> M; 3079 N->getMask(M); 3080 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3081} 3082 3083/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3084/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
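///
/// UNPCKHPS/PUNPCKHDQ interleave the high halves of the two sources, so for
/// v4i32 the canonical mask is <2, 6, 3, 7> and for v2i64 it is <1, 3>
/// (undef is allowed in any position).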
3085static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3086 bool V2IsSplat = false) { 3087 int NumElts = VT.getVectorNumElements(); 3088 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3089 return false; 3090 3091 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3092 int BitI = Mask[i]; 3093 int BitI1 = Mask[i+1]; 3094 if (!isUndefOrEqual(BitI, j + NumElts/2)) 3095 return false; 3096 if (V2IsSplat) { 3097 if (isUndefOrEqual(BitI1, NumElts)) 3098 return false; 3099 } else { 3100 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 3101 return false; 3102 } 3103 } 3104 return true; 3105} 3106 3107bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3108 SmallVector<int, 8> M; 3109 N->getMask(M); 3110 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3111} 3112 3113/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3114/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3115/// <0, 0, 1, 1> 3116static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3117 int NumElems = VT.getVectorNumElements(); 3118 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3119 return false; 3120 3121 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 3122 int BitI = Mask[i]; 3123 int BitI1 = Mask[i+1]; 3124 if (!isUndefOrEqual(BitI, j)) 3125 return false; 3126 if (!isUndefOrEqual(BitI1, j)) 3127 return false; 3128 } 3129 return true; 3130} 3131 3132bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3133 SmallVector<int, 8> M; 3134 N->getMask(M); 3135 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3136} 3137 3138/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3139/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3140/// <2, 2, 3, 3> 3141static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3142 int NumElems = VT.getVectorNumElements(); 3143 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3144 return false; 3145 3146 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3147 int BitI = Mask[i]; 3148 int BitI1 = Mask[i+1]; 3149 if (!isUndefOrEqual(BitI, j)) 3150 return false; 3151 if (!isUndefOrEqual(BitI1, j)) 3152 return false; 3153 } 3154 return true; 3155} 3156 3157bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3158 SmallVector<int, 8> M; 3159 N->getMask(M); 3160 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3161} 3162 3163/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3164/// specifies a shuffle of elements that is suitable for input to MOVSS, 3165/// MOVSD, and MOVD, i.e. setting the lowest element. 3166static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3167 if (VT.getVectorElementType().getSizeInBits() < 32) 3168 return false; 3169 3170 int NumElts = VT.getVectorNumElements(); 3171 3172 if (!isUndefOrEqual(Mask[0], NumElts)) 3173 return false; 3174 3175 for (int i = 1; i < NumElts; ++i) 3176 if (!isUndefOrEqual(Mask[i], i)) 3177 return false; 3178 3179 return true; 3180} 3181 3182bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3183 SmallVector<int, 8> M; 3184 N->getMask(M); 3185 return ::isMOVLMask(M, N->getValueType(0)); 3186} 3187 3188/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 3189/// of what x86 movss want. X86 movs requires the lowest element to be lowest 3190/// element of vector 2 and the other elements to come from vector 1 in order. 
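///
/// Concretely, for v4i32 a MOVL mask is <4, 1, 2, 3> (low element taken from
/// V2, the rest from V1 in order); the commuted form recognized here is
/// <0, 5, 6, 7>, i.e. the same pattern with the two sources exchanged.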
3191static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3192 bool V2IsSplat = false, bool V2IsUndef = false) { 3193 int NumOps = VT.getVectorNumElements(); 3194 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3195 return false; 3196 3197 if (!isUndefOrEqual(Mask[0], 0)) 3198 return false; 3199 3200 for (int i = 1; i < NumOps; ++i) 3201 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3202 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3203 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3204 return false; 3205 3206 return true; 3207} 3208 3209static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3210 bool V2IsUndef = false) { 3211 SmallVector<int, 8> M; 3212 N->getMask(M); 3213 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3214} 3215 3216/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3217/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3218bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3219 if (N->getValueType(0).getVectorNumElements() != 4) 3220 return false; 3221 3222 // Expect 1, 1, 3, 3 3223 for (unsigned i = 0; i < 2; ++i) { 3224 int Elt = N->getMaskElt(i); 3225 if (Elt >= 0 && Elt != 1) 3226 return false; 3227 } 3228 3229 bool HasHi = false; 3230 for (unsigned i = 2; i < 4; ++i) { 3231 int Elt = N->getMaskElt(i); 3232 if (Elt >= 0 && Elt != 3) 3233 return false; 3234 if (Elt == 3) 3235 HasHi = true; 3236 } 3237 // Don't use movshdup if it can be done with a shufps. 3238 // FIXME: verify that matching u, u, 3, 3 is what we want. 3239 return HasHi; 3240} 3241 3242/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3243/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3244bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3245 if (N->getValueType(0).getVectorNumElements() != 4) 3246 return false; 3247 3248 // Expect 0, 0, 2, 2 3249 for (unsigned i = 0; i < 2; ++i) 3250 if (N->getMaskElt(i) > 0) 3251 return false; 3252 3253 bool HasHi = false; 3254 for (unsigned i = 2; i < 4; ++i) { 3255 int Elt = N->getMaskElt(i); 3256 if (Elt >= 0 && Elt != 2) 3257 return false; 3258 if (Elt == 2) 3259 HasHi = true; 3260 } 3261 // Don't use movsldup if it can be done with a shufps. 3262 return HasHi; 3263} 3264 3265/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3266/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3267bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3268 int e = N->getValueType(0).getVectorNumElements() / 2; 3269 3270 for (int i = 0; i < e; ++i) 3271 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3272 return false; 3273 for (int i = 0; i < e; ++i) 3274 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3275 return false; 3276 return true; 3277} 3278 3279/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3280/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3281unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3282 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3283 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3284 3285 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3286 unsigned Mask = 0; 3287 for (int i = 0; i < NumOperands; ++i) { 3288 int Val = SVOp->getMaskElt(NumOperands-i-1); 3289 if (Val < 0) Val = 0; 3290 if (Val >= NumOperands) Val -= NumOperands; 3291 Mask |= Val; 3292 if (i != NumOperands - 1) 3293 Mask <<= Shift; 3294 } 3295 return Mask; 3296} 3297 3298/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3299/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3300unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3301 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3302 unsigned Mask = 0; 3303 // 8 nodes, but we only care about the last 4. 3304 for (unsigned i = 7; i >= 4; --i) { 3305 int Val = SVOp->getMaskElt(i); 3306 if (Val >= 0) 3307 Mask |= (Val - 4); 3308 if (i != 4) 3309 Mask <<= 2; 3310 } 3311 return Mask; 3312} 3313 3314/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3315/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3316unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3317 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3318 unsigned Mask = 0; 3319 // 8 nodes, but we only care about the first 4. 3320 for (int i = 3; i >= 0; --i) { 3321 int Val = SVOp->getMaskElt(i); 3322 if (Val >= 0) 3323 Mask |= Val; 3324 if (i != 0) 3325 Mask <<= 2; 3326 } 3327 return Mask; 3328} 3329 3330/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3331/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3332unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3333 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3334 EVT VVT = N->getValueType(0); 3335 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3336 int Val = 0; 3337 3338 unsigned i, e; 3339 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3340 Val = SVOp->getMaskElt(i); 3341 if (Val >= 0) 3342 break; 3343 } 3344 return (Val - i) * EltSize; 3345} 3346 3347/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3348/// constant +0.0. 3349bool X86::isZeroNode(SDValue Elt) { 3350 return ((isa<ConstantSDNode>(Elt) && 3351 cast<ConstantSDNode>(Elt)->isNullValue()) || 3352 (isa<ConstantFPSDNode>(Elt) && 3353 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3354} 3355 3356/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3357/// their permute mask. 3358static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3359 SelectionDAG &DAG) { 3360 EVT VT = SVOp->getValueType(0); 3361 unsigned NumElems = VT.getVectorNumElements(); 3362 SmallVector<int, 8> MaskVec; 3363 3364 for (unsigned i = 0; i != NumElems; ++i) { 3365 int idx = SVOp->getMaskElt(i); 3366 if (idx < 0) 3367 MaskVec.push_back(idx); 3368 else if (idx < (int)NumElems) 3369 MaskVec.push_back(idx + NumElems); 3370 else 3371 MaskVec.push_back(idx - NumElems); 3372 } 3373 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3374 SVOp->getOperand(0), &MaskVec[0]); 3375} 3376 3377/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3378/// the two vector operands have swapped position. 
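///
/// For example, with a v4i32 mask <4, 1, 2, 3>, swapping the operands turns
/// it into <0, 5, 6, 7>: indices that referred to the first source gain
/// NumElems, indices that referred to the second source lose NumElems, and
/// undef indices are left unchanged.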
static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
  unsigned NumElems = VT.getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = Mask[i];
    if (idx < 0)
      continue;
    else if (idx < (int)NumElems)
      Mask[i] = idx + NumElems;
    else
      Mask[i] = idx - NumElems;
  }
}

/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
/// match movhlps. The lower half elements should come from the upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
  if (Op->getValueType(0).getVectorNumElements() != 4)
    return false;
  for (unsigned i = 0, e = 2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
      return false;
  for (unsigned i = 2; i != 4; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
      return false;
  return true;
}

/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  N = N->getOperand(0).getNode();
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from the lower half
/// of V1 (and in order), and the upper half elements should come from the
/// upper half of V2 (and in order). And since V1 will become the source of
/// the MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ShuffleVectorSDNode *Op) {
  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // If V2 is a vector load, don't do this transformation. We would prefer to
  // use a load-folding shufps instead.
  if (ISD::isNON_EXTLoad(V2))
    return false;

  unsigned NumElems = Op->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i))
      return false;
  for (unsigned i = NumElems/2; i != NumElems; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
      return false;
  return true;
}

/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
/// all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to a zero vector.
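/// For example, vector_shuffle (build_vector X, 0, 0, 0),
/// (build_vector 0, 0, 0, 0), <1, 2, 4, 5> only ever selects lanes that are
/// known to be zero, so it qualifies; if the mask also selected lane 0 (the
/// unknown X, assuming X is not itself a known zero) it would not.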
3464/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3465static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3466 SDValue V1 = N->getOperand(0); 3467 SDValue V2 = N->getOperand(1); 3468 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3469 for (unsigned i = 0; i != NumElems; ++i) { 3470 int Idx = N->getMaskElt(i); 3471 if (Idx >= (int)NumElems) { 3472 unsigned Opc = V2.getOpcode(); 3473 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3474 continue; 3475 if (Opc != ISD::BUILD_VECTOR || 3476 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3477 return false; 3478 } else if (Idx >= 0) { 3479 unsigned Opc = V1.getOpcode(); 3480 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3481 continue; 3482 if (Opc != ISD::BUILD_VECTOR || 3483 !X86::isZeroNode(V1.getOperand(Idx))) 3484 return false; 3485 } 3486 } 3487 return true; 3488} 3489 3490/// getZeroVector - Returns a vector of specified type with all zero elements. 3491/// 3492static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3493 DebugLoc dl) { 3494 assert(VT.isVector() && "Expected a vector type"); 3495 3496 // Always build SSE zero vectors as <4 x i32> bitcasted 3497 // to their dest type. This ensures they get CSE'd. 3498 SDValue Vec; 3499 if (VT.getSizeInBits() == 128) { // SSE 3500 if (HasSSE2) { // SSE2 3501 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3502 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3503 } else { // SSE1 3504 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3505 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3506 } 3507 } else if (VT.getSizeInBits() == 256) { // AVX 3508 // 256-bit logic and arithmetic instructions in AVX are 3509 // all floating-point, no support for integer ops. Default 3510 // to emitting fp zeroed vectors then. 3511 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3512 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3513 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3514 } 3515 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3516} 3517 3518/// getOnesVector - Returns a vector of specified type with all bits set. 3519/// 3520static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3521 assert(VT.isVector() && "Expected a vector type"); 3522 3523 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3524 // type. This ensures they get CSE'd. 3525 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3526 SDValue Vec; 3527 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3528 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3529} 3530 3531 3532/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3533/// that point to V2 points to its first element. 
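///
/// For instance, if V2 is a splat, a v4i32 mask of <0, 1, 6, 7> becomes
/// <0, 1, 4, 4>: every reference into V2 is redirected to V2's element 0,
/// which is legal precisely because all of V2's elements are identical.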
3534static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3535 EVT VT = SVOp->getValueType(0); 3536 unsigned NumElems = VT.getVectorNumElements(); 3537 3538 bool Changed = false; 3539 SmallVector<int, 8> MaskVec; 3540 SVOp->getMask(MaskVec); 3541 3542 for (unsigned i = 0; i != NumElems; ++i) { 3543 if (MaskVec[i] > (int)NumElems) { 3544 MaskVec[i] = NumElems; 3545 Changed = true; 3546 } 3547 } 3548 if (Changed) 3549 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3550 SVOp->getOperand(1), &MaskVec[0]); 3551 return SDValue(SVOp, 0); 3552} 3553 3554/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3555/// operation of specified width. 3556static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3557 SDValue V2) { 3558 unsigned NumElems = VT.getVectorNumElements(); 3559 SmallVector<int, 8> Mask; 3560 Mask.push_back(NumElems); 3561 for (unsigned i = 1; i != NumElems; ++i) 3562 Mask.push_back(i); 3563 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3564} 3565 3566/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3567static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3568 SDValue V2) { 3569 unsigned NumElems = VT.getVectorNumElements(); 3570 SmallVector<int, 8> Mask; 3571 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3572 Mask.push_back(i); 3573 Mask.push_back(i + NumElems); 3574 } 3575 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3576} 3577 3578/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3579static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3580 SDValue V2) { 3581 unsigned NumElems = VT.getVectorNumElements(); 3582 unsigned Half = NumElems/2; 3583 SmallVector<int, 8> Mask; 3584 for (unsigned i = 0; i != Half; ++i) { 3585 Mask.push_back(i + Half); 3586 Mask.push_back(i + NumElems + Half); 3587 } 3588 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3589} 3590 3591/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3592static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 3593 EVT PVT = MVT::v4f32; 3594 EVT VT = SV->getValueType(0); 3595 DebugLoc dl = SV->getDebugLoc(); 3596 SDValue V1 = SV->getOperand(0); 3597 int NumElems = VT.getVectorNumElements(); 3598 int EltNo = SV->getSplatIndex(); 3599 3600 // unpack elements to the correct location 3601 while (NumElems > 4) { 3602 if (EltNo < NumElems/2) { 3603 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3604 } else { 3605 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3606 EltNo -= NumElems/2; 3607 } 3608 NumElems >>= 1; 3609 } 3610 3611 // Perform the splat. 3612 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3613 V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1); 3614 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3615 return DAG.getNode(ISD::BITCAST, dl, VT, V1); 3616} 3617 3618/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3619/// vector of zero or undef vector. This produces a shuffle where the low 3620/// element of V2 is swizzled into the zero/undef vector, landing at element 3621/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3622static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3623 bool isZero, bool HasSSE2, 3624 SelectionDAG &DAG) { 3625 EVT VT = V2.getValueType(); 3626 SDValue V1 = isZero 3627 ? 
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3628 unsigned NumElems = VT.getVectorNumElements(); 3629 SmallVector<int, 16> MaskVec; 3630 for (unsigned i = 0; i != NumElems; ++i) 3631 // If this is the insertion idx, put the low elt of V2 here. 3632 MaskVec.push_back(i == Idx ? NumElems : i); 3633 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3634} 3635 3636/// getShuffleScalarElt - Returns the scalar element that will make up the ith 3637/// element of the result of the vector shuffle. 3638SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 3639 unsigned Depth) { 3640 if (Depth == 6) 3641 return SDValue(); // Limit search depth. 3642 3643 SDValue V = SDValue(N, 0); 3644 EVT VT = V.getValueType(); 3645 unsigned Opcode = V.getOpcode(); 3646 3647 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 3648 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 3649 Index = SV->getMaskElt(Index); 3650 3651 if (Index < 0) 3652 return DAG.getUNDEF(VT.getVectorElementType()); 3653 3654 int NumElems = VT.getVectorNumElements(); 3655 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 3656 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 3657 } 3658 3659 // Recurse into target specific vector shuffles to find scalars. 3660 if (isTargetShuffle(Opcode)) { 3661 int NumElems = VT.getVectorNumElements(); 3662 SmallVector<unsigned, 16> ShuffleMask; 3663 SDValue ImmN; 3664 3665 switch(Opcode) { 3666 case X86ISD::SHUFPS: 3667 case X86ISD::SHUFPD: 3668 ImmN = N->getOperand(N->getNumOperands()-1); 3669 DecodeSHUFPSMask(NumElems, 3670 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3671 ShuffleMask); 3672 break; 3673 case X86ISD::PUNPCKHBW: 3674 case X86ISD::PUNPCKHWD: 3675 case X86ISD::PUNPCKHDQ: 3676 case X86ISD::PUNPCKHQDQ: 3677 DecodePUNPCKHMask(NumElems, ShuffleMask); 3678 break; 3679 case X86ISD::UNPCKHPS: 3680 case X86ISD::UNPCKHPD: 3681 DecodeUNPCKHPMask(NumElems, ShuffleMask); 3682 break; 3683 case X86ISD::PUNPCKLBW: 3684 case X86ISD::PUNPCKLWD: 3685 case X86ISD::PUNPCKLDQ: 3686 case X86ISD::PUNPCKLQDQ: 3687 DecodePUNPCKLMask(NumElems, ShuffleMask); 3688 break; 3689 case X86ISD::UNPCKLPS: 3690 case X86ISD::UNPCKLPD: 3691 DecodeUNPCKLPMask(NumElems, ShuffleMask); 3692 break; 3693 case X86ISD::MOVHLPS: 3694 DecodeMOVHLPSMask(NumElems, ShuffleMask); 3695 break; 3696 case X86ISD::MOVLHPS: 3697 DecodeMOVLHPSMask(NumElems, ShuffleMask); 3698 break; 3699 case X86ISD::PSHUFD: 3700 ImmN = N->getOperand(N->getNumOperands()-1); 3701 DecodePSHUFMask(NumElems, 3702 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3703 ShuffleMask); 3704 break; 3705 case X86ISD::PSHUFHW: 3706 ImmN = N->getOperand(N->getNumOperands()-1); 3707 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3708 ShuffleMask); 3709 break; 3710 case X86ISD::PSHUFLW: 3711 ImmN = N->getOperand(N->getNumOperands()-1); 3712 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3713 ShuffleMask); 3714 break; 3715 case X86ISD::MOVSS: 3716 case X86ISD::MOVSD: { 3717 // The index 0 always comes from the first element of the second source, 3718 // this is why MOVSS and MOVSD are used in the first place. The other 3719 // elements come from the other positions of the first source vector. 3720 unsigned OpNum = (Index == 0) ? 
1 : 0;
      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                 Depth+1);
    }
    default:
      assert(0 && "not implemented for target shuffle node");
      return SDValue();
    }

    Index = ShuffleMask[Index];
    if (Index < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
                               Depth+1);
  }

  // Actual nodes that may contain scalar elements
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}

/// getNumOfConsecutiveZeros - Return the number of elements of a vector
/// shuffle operation which come consecutively from a zero vector. The search
/// can start in two different directions, from left or from right.
static
unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
                                  bool ZerosFromLeft, SelectionDAG &DAG) {
  int i = 0;

  while (i < NumElems) {
    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
    if (!(Elt.getNode() &&
         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
      break;
    ++i;
  }

  return i;
}

/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
/// MaskE correspond consecutively to elements from one of the vector operands,
/// starting from its index OpIdx. Also set OpNum to the source vector operand
/// that was used.
static
bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
                              int OpIdx, int NumElems, unsigned &OpNum) {
  bool SeenV1 = false;
  bool SeenV2 = false;

  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
    int Idx = SVOp->getMaskElt(i);
    // Ignore undef indices
    if (Idx < 0)
      continue;

    if (Idx < NumElems)
      SeenV1 = true;
    else
      SeenV2 = true;

    // Only accept consecutive elements from the same vector
    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
      return false;
  }

  OpNum = SeenV1 ? 0 : 1;
  return true;
}

/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical right shift of a vector.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              false /* check zeros from right */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
3821 // 3822 // V1 = {X, A, B, C} 0 3823 // \ \ \ / 3824 // vector_shuffle V1, V2 <1, 2, 3, X> 3825 // 3826 if (!isShuffleMaskConsecutive(SVOp, 3827 0, // Mask Start Index 3828 NumElems-NumZeros-1, // Mask End Index 3829 NumZeros, // Where to start looking in the src vector 3830 NumElems, // Number of elements in vector 3831 OpSrc)) // Which source operand ? 3832 return false; 3833 3834 isLeft = false; 3835 ShAmt = NumZeros; 3836 ShVal = SVOp->getOperand(OpSrc); 3837 return true; 3838} 3839 3840/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 3841/// logical left shift of a vector. 3842static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3843 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3844 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3845 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 3846 true /* check zeros from left */, DAG); 3847 unsigned OpSrc; 3848 3849 if (!NumZeros) 3850 return false; 3851 3852 // Considering the elements in the mask that are not consecutive zeros, 3853 // check if they consecutively come from only one of the source vectors. 3854 // 3855 // 0 { A, B, X, X } = V2 3856 // / \ / / 3857 // vector_shuffle V1, V2 <X, X, 4, 5> 3858 // 3859 if (!isShuffleMaskConsecutive(SVOp, 3860 NumZeros, // Mask Start Index 3861 NumElems-1, // Mask End Index 3862 0, // Where to start looking in the src vector 3863 NumElems, // Number of elements in vector 3864 OpSrc)) // Which source operand ? 3865 return false; 3866 3867 isLeft = true; 3868 ShAmt = NumZeros; 3869 ShVal = SVOp->getOperand(OpSrc); 3870 return true; 3871} 3872 3873/// isVectorShift - Returns true if the shuffle can be implemented as a 3874/// logical left or right shift of a vector. 3875static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3876 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3877 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 3878 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 3879 return true; 3880 3881 return false; 3882} 3883 3884/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
3885/// 3886static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3887 unsigned NumNonZero, unsigned NumZero, 3888 SelectionDAG &DAG, 3889 const TargetLowering &TLI) { 3890 if (NumNonZero > 8) 3891 return SDValue(); 3892 3893 DebugLoc dl = Op.getDebugLoc(); 3894 SDValue V(0, 0); 3895 bool First = true; 3896 for (unsigned i = 0; i < 16; ++i) { 3897 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3898 if (ThisIsNonZero && First) { 3899 if (NumZero) 3900 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3901 else 3902 V = DAG.getUNDEF(MVT::v8i16); 3903 First = false; 3904 } 3905 3906 if ((i & 1) != 0) { 3907 SDValue ThisElt(0, 0), LastElt(0, 0); 3908 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3909 if (LastIsNonZero) { 3910 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3911 MVT::i16, Op.getOperand(i-1)); 3912 } 3913 if (ThisIsNonZero) { 3914 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3915 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3916 ThisElt, DAG.getConstant(8, MVT::i8)); 3917 if (LastIsNonZero) 3918 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3919 } else 3920 ThisElt = LastElt; 3921 3922 if (ThisElt.getNode()) 3923 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3924 DAG.getIntPtrConstant(i/2)); 3925 } 3926 } 3927 3928 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 3929} 3930 3931/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3932/// 3933static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3934 unsigned NumNonZero, unsigned NumZero, 3935 SelectionDAG &DAG, 3936 const TargetLowering &TLI) { 3937 if (NumNonZero > 4) 3938 return SDValue(); 3939 3940 DebugLoc dl = Op.getDebugLoc(); 3941 SDValue V(0, 0); 3942 bool First = true; 3943 for (unsigned i = 0; i < 8; ++i) { 3944 bool isNonZero = (NonZeros & (1 << i)) != 0; 3945 if (isNonZero) { 3946 if (First) { 3947 if (NumZero) 3948 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3949 else 3950 V = DAG.getUNDEF(MVT::v8i16); 3951 First = false; 3952 } 3953 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3954 MVT::v8i16, V, Op.getOperand(i), 3955 DAG.getIntPtrConstant(i)); 3956 } 3957 } 3958 3959 return V; 3960} 3961 3962/// getVShift - Return a vector logical shift node. 3963/// 3964static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3965 unsigned NumBits, SelectionDAG &DAG, 3966 const TargetLowering &TLI, DebugLoc dl) { 3967 EVT ShVT = MVT::v2i64; 3968 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3969 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 3970 return DAG.getNode(ISD::BITCAST, dl, VT, 3971 DAG.getNode(Opc, dl, ShVT, SrcOp, 3972 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3973} 3974 3975SDValue 3976X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3977 SelectionDAG &DAG) const { 3978 3979 // Check if the scalar load can be widened into a vector load. And if 3980 // the address is "base + cst" see if the cst can be "absorbed" into 3981 // the shuffle mask. 
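  //
  // For example, an f32 load from (FrameIndex + 20), with the stack slot
  // aligned to 16 bytes, can be widened to a v4f32 load from (FrameIndex + 16)
  // followed by a <1, 1, 1, 1> splat shuffle, since (20 - 16) / 4 selects
  // element 1 of the wide load.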
3982 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3983 SDValue Ptr = LD->getBasePtr(); 3984 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3985 return SDValue(); 3986 EVT PVT = LD->getValueType(0); 3987 if (PVT != MVT::i32 && PVT != MVT::f32) 3988 return SDValue(); 3989 3990 int FI = -1; 3991 int64_t Offset = 0; 3992 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3993 FI = FINode->getIndex(); 3994 Offset = 0; 3995 } else if (Ptr.getOpcode() == ISD::ADD && 3996 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3997 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3998 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3999 Offset = Ptr.getConstantOperandVal(1); 4000 Ptr = Ptr.getOperand(0); 4001 } else { 4002 return SDValue(); 4003 } 4004 4005 SDValue Chain = LD->getChain(); 4006 // Make sure the stack object alignment is at least 16. 4007 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4008 if (DAG.InferPtrAlignment(Ptr) < 16) { 4009 if (MFI->isFixedObjectIndex(FI)) { 4010 // Can't change the alignment. FIXME: It's possible to compute 4011 // the exact stack offset and reference FI + adjust offset instead. 4012 // If someone *really* cares about this. That's the way to implement it. 4013 return SDValue(); 4014 } else { 4015 MFI->setObjectAlignment(FI, 16); 4016 } 4017 } 4018 4019 // (Offset % 16) must be multiple of 4. Then address is then 4020 // Ptr + (Offset & ~15). 4021 if (Offset < 0) 4022 return SDValue(); 4023 if ((Offset % 16) & 3) 4024 return SDValue(); 4025 int64_t StartOffset = Offset & ~15; 4026 if (StartOffset) 4027 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4028 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4029 4030 int EltNo = (Offset - StartOffset) >> 2; 4031 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 4032 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; 4033 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, 4034 LD->getPointerInfo().getWithOffset(StartOffset), 4035 false, false, 0); 4036 // Canonicalize it to a v4i32 shuffle. 4037 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 4038 return DAG.getNode(ISD::BITCAST, dl, VT, 4039 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 4040 DAG.getUNDEF(MVT::v4i32),&Mask[0])); 4041 } 4042 4043 return SDValue(); 4044} 4045 4046/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4047/// vector of type 'VT', see if the elements can be replaced by a single large 4048/// load which has the same value as a build_vector whose operands are 'elts'. 4049/// 4050/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4051/// 4052/// FIXME: we'd also like to handle the case where the last elements are zero 4053/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4054/// There's even a handy isZeroNode for that purpose. 4055static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4056 DebugLoc &DL, SelectionDAG &DAG) { 4057 EVT EltVT = VT.getVectorElementType(); 4058 unsigned NumElems = Elts.size(); 4059 4060 LoadSDNode *LDBase = NULL; 4061 unsigned LastLoadedElt = -1U; 4062 4063 // For each element in the initializer, see if we've found a load or an undef. 4064 // If we don't find an initial load element, or later load elements are 4065 // non-consecutive, bail out. 
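  //
  // For example, for v4i32 the element list <load p, load p+4, load p+8,
  // load p+12> becomes a single 16-byte load of the whole vector, while
  // <load p, load p+4, undef, undef> is handled further below with a
  // VZEXT_LOAD node.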
4066 for (unsigned i = 0; i < NumElems; ++i) { 4067 SDValue Elt = Elts[i]; 4068 4069 if (!Elt.getNode() || 4070 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4071 return SDValue(); 4072 if (!LDBase) { 4073 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4074 return SDValue(); 4075 LDBase = cast<LoadSDNode>(Elt.getNode()); 4076 LastLoadedElt = i; 4077 continue; 4078 } 4079 if (Elt.getOpcode() == ISD::UNDEF) 4080 continue; 4081 4082 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4083 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4084 return SDValue(); 4085 LastLoadedElt = i; 4086 } 4087 4088 // If we have found an entire vector of loads and undefs, then return a large 4089 // load of the entire vector width starting at the base pointer. If we found 4090 // consecutive loads for the low half, generate a vzext_load node. 4091 if (LastLoadedElt == NumElems - 1) { 4092 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4093 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4094 LDBase->getPointerInfo(), 4095 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 4096 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4097 LDBase->getPointerInfo(), 4098 LDBase->isVolatile(), LDBase->isNonTemporal(), 4099 LDBase->getAlignment()); 4100 } else if (NumElems == 4 && LastLoadedElt == 1) { 4101 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4102 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4103 SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, 4104 Ops, 2, MVT::i32, 4105 LDBase->getMemOperand()); 4106 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4107 } 4108 return SDValue(); 4109} 4110 4111SDValue 4112X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 4113 DebugLoc dl = Op.getDebugLoc(); 4114 // All zero's are handled with pxor in SSE2 and above, xorps in SSE1. 4115 // All one's are handled with pcmpeqd. In AVX, zero's are handled with 4116 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd 4117 // is present, so AllOnes is ignored. 4118 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 4119 (Op.getValueType().getSizeInBits() != 256 && 4120 ISD::isBuildVectorAllOnes(Op.getNode()))) { 4121 // Canonicalize this to <4 x i32> (SSE) to 4122 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 4123 // eliminated on x86-32 hosts. 4124 if (Op.getValueType() == MVT::v4i32) 4125 return Op; 4126 4127 if (ISD::isBuildVectorAllOnes(Op.getNode())) 4128 return getOnesVector(Op.getValueType(), DAG, dl); 4129 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 4130 } 4131 4132 EVT VT = Op.getValueType(); 4133 EVT ExtVT = VT.getVectorElementType(); 4134 unsigned EVTBits = ExtVT.getSizeInBits(); 4135 4136 unsigned NumElems = Op.getNumOperands(); 4137 unsigned NumZero = 0; 4138 unsigned NumNonZero = 0; 4139 unsigned NonZeros = 0; 4140 bool IsAllConstants = true; 4141 SmallSet<SDValue, 8> Values; 4142 for (unsigned i = 0; i < NumElems; ++i) { 4143 SDValue Elt = Op.getOperand(i); 4144 if (Elt.getOpcode() == ISD::UNDEF) 4145 continue; 4146 Values.insert(Elt); 4147 if (Elt.getOpcode() != ISD::Constant && 4148 Elt.getOpcode() != ISD::ConstantFP) 4149 IsAllConstants = false; 4150 if (X86::isZeroNode(Elt)) 4151 NumZero++; 4152 else { 4153 NonZeros |= (1 << i); 4154 NumNonZero++; 4155 } 4156 } 4157 4158 // All undef vector. Return an UNDEF. All zero vectors were handled above. 
4159 if (NumNonZero == 0) 4160 return DAG.getUNDEF(VT); 4161 4162 // Special case for single non-zero, non-undef, element. 4163 if (NumNonZero == 1) { 4164 unsigned Idx = CountTrailingZeros_32(NonZeros); 4165 SDValue Item = Op.getOperand(Idx); 4166 4167 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4168 // the value are obviously zero, truncate the value to i32 and do the 4169 // insertion that way. Only do this if the value is non-constant or if the 4170 // value is a constant being inserted into element 0. It is cheaper to do 4171 // a constant pool load than it is to do a movd + shuffle. 4172 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 4173 (!IsAllConstants || Idx == 0)) { 4174 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 4175 // Handle SSE only. 4176 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 4177 EVT VecVT = MVT::v4i32; 4178 unsigned VecElts = 4; 4179 4180 // Truncate the value (which may itself be a constant) to i32, and 4181 // convert it to a vector with movd (S2V+shuffle to zero extend). 4182 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4183 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4184 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4185 Subtarget->hasSSE2(), DAG); 4186 4187 // Now we have our 32-bit value zero extended in the low element of 4188 // a vector. If Idx != 0, swizzle it into place. 4189 if (Idx != 0) { 4190 SmallVector<int, 4> Mask; 4191 Mask.push_back(Idx); 4192 for (unsigned i = 1; i != VecElts; ++i) 4193 Mask.push_back(i); 4194 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4195 DAG.getUNDEF(Item.getValueType()), 4196 &Mask[0]); 4197 } 4198 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); 4199 } 4200 } 4201 4202 // If we have a constant or non-constant insertion into the low element of 4203 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4204 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4205 // depending on what the source datatype is. 4206 if (Idx == 0) { 4207 if (NumZero == 0) { 4208 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4209 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4210 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4211 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4212 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 4213 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4214 DAG); 4215 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4216 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4217 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 4218 EVT MiddleVT = MVT::v4i32; 4219 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4220 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4221 Subtarget->hasSSE2(), DAG); 4222 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 4223 } 4224 } 4225 4226 // Is it a vector logical left shift? 4227 if (NumElems == 2 && Idx == 1 && 4228 X86::isZeroNode(Op.getOperand(0)) && 4229 !X86::isZeroNode(Op.getOperand(1))) { 4230 unsigned NumBits = VT.getSizeInBits(); 4231 return getVShift(true, VT, 4232 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4233 VT, Op.getOperand(1)), 4234 NumBits/2, DAG, *this, dl); 4235 } 4236 4237 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
4238 return SDValue(); 4239 4240 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4241 // is a non-constant being inserted into an element other than the low one, 4242 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4243 // movd/movss) to move this into the low element, then shuffle it into 4244 // place. 4245 if (EVTBits == 32) { 4246 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4247 4248 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4249 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4250 Subtarget->hasSSE2(), DAG); 4251 SmallVector<int, 8> MaskVec; 4252 for (unsigned i = 0; i < NumElems; i++) 4253 MaskVec.push_back(i == Idx ? 0 : 1); 4254 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4255 } 4256 } 4257 4258 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4259 if (Values.size() == 1) { 4260 if (EVTBits == 32) { 4261 // Instead of a shuffle like this: 4262 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4263 // Check if it's possible to issue this instead. 4264 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4265 unsigned Idx = CountTrailingZeros_32(NonZeros); 4266 SDValue Item = Op.getOperand(Idx); 4267 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4268 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4269 } 4270 return SDValue(); 4271 } 4272 4273 // A vector full of immediates; various special cases are already 4274 // handled, so this is best done with a single constant-pool load. 4275 if (IsAllConstants) 4276 return SDValue(); 4277 4278 // Let legalizer expand 2-wide build_vectors. 4279 if (EVTBits == 64) { 4280 if (NumNonZero == 1) { 4281 // One half is zero or undef. 4282 unsigned Idx = CountTrailingZeros_32(NonZeros); 4283 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4284 Op.getOperand(Idx)); 4285 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4286 Subtarget->hasSSE2(), DAG); 4287 } 4288 return SDValue(); 4289 } 4290 4291 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4292 if (EVTBits == 8 && NumElems == 16) { 4293 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4294 *this); 4295 if (V.getNode()) return V; 4296 } 4297 4298 if (EVTBits == 16 && NumElems == 8) { 4299 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4300 *this); 4301 if (V.getNode()) return V; 4302 } 4303 4304 // If element VT is == 32 bits, turn it into a number of shuffles. 4305 SmallVector<SDValue, 8> V; 4306 V.resize(NumElems); 4307 if (NumElems == 4 && NumZero > 0) { 4308 for (unsigned i = 0; i < 4; ++i) { 4309 bool isZero = !(NonZeros & (1 << i)); 4310 if (isZero) 4311 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4312 else 4313 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4314 } 4315 4316 for (unsigned i = 0; i < 2; ++i) { 4317 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4318 default: break; 4319 case 0: 4320 V[i] = V[i*2]; // Must be a zero vector. 4321 break; 4322 case 1: 4323 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4324 break; 4325 case 2: 4326 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4327 break; 4328 case 3: 4329 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4330 break; 4331 } 4332 } 4333 4334 SmallVector<int, 8> MaskVec; 4335 bool Reverse = (NonZeros & 0x3) == 2; 4336 for (unsigned i = 0; i < 2; ++i) 4337 MaskVec.push_back(Reverse ? 
1-i : i); 4338 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4339 for (unsigned i = 0; i < 2; ++i) 4340 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4341 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4342 } 4343 4344 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4345 // Check for a build vector of consecutive loads. 4346 for (unsigned i = 0; i < NumElems; ++i) 4347 V[i] = Op.getOperand(i); 4348 4349 // Check for elements which are consecutive loads. 4350 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4351 if (LD.getNode()) 4352 return LD; 4353 4354 // For SSE 4.1, use insertps to put the high elements into the low element. 4355 if (getSubtarget()->hasSSE41()) { 4356 SDValue Result; 4357 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 4358 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 4359 else 4360 Result = DAG.getUNDEF(VT); 4361 4362 for (unsigned i = 1; i < NumElems; ++i) { 4363 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 4364 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 4365 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4366 } 4367 return Result; 4368 } 4369 4370 // Otherwise, expand into a number of unpckl*, start by extending each of 4371 // our (non-undef) elements to the full vector width with the element in the 4372 // bottom slot of the vector (which generates no code for SSE). 4373 for (unsigned i = 0; i < NumElems; ++i) { 4374 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4375 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4376 else 4377 V[i] = DAG.getUNDEF(VT); 4378 } 4379 4380 // Next, we iteratively mix elements, e.g. for v4f32: 4381 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4382 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4383 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4384 unsigned EltStride = NumElems >> 1; 4385 while (EltStride != 0) { 4386 for (unsigned i = 0; i < EltStride; ++i) { 4387 // If V[i+EltStride] is undef and this is the first round of mixing, 4388 // then it is safe to just drop this shuffle: V[i] is already in the 4389 // right place, the one element (since it's the first round) being 4390 // inserted as undef can be dropped. This isn't safe for successive 4391 // rounds because they will permute elements within both vectors. 4392 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 4393 EltStride == NumElems/2) 4394 continue; 4395 4396 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 4397 } 4398 EltStride >>= 1; 4399 } 4400 return V[0]; 4401 } 4402 return SDValue(); 4403} 4404 4405SDValue 4406X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4407 // We support concatenate two MMX registers and place them in a MMX 4408 // register. This is better than doing a stack convert. 
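  //
  // For example, when concatenating two 64-bit (MMX-sized) vector operands,
  // each is bitcast to v1i64 and moved into an XMM register with MOVQ2DQ; the
  // two halves are then merged with a <0, 2> v2i64 shuffle (or a direct
  // element insert when the second operand is a scalar_to_vector), avoiding a
  // store/reload through the stack.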
4409 DebugLoc dl = Op.getDebugLoc(); 4410 EVT ResVT = Op.getValueType(); 4411 assert(Op.getNumOperands() == 2); 4412 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4413 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4414 int Mask[2]; 4415 SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); 4416 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4417 InVec = Op.getOperand(1); 4418 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4419 unsigned NumElts = ResVT.getVectorNumElements(); 4420 VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4421 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4422 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4423 } else { 4424 InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); 4425 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4426 Mask[0] = 0; Mask[1] = 2; 4427 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4428 } 4429 return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4430} 4431 4432// v8i16 shuffles - Prefer shuffles in the following order: 4433// 1. [all] pshuflw, pshufhw, optional move 4434// 2. [ssse3] 1 x pshufb 4435// 3. [ssse3] 2 x pshufb + 1 x por 4436// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4437SDValue 4438X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4439 SelectionDAG &DAG) const { 4440 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4441 SDValue V1 = SVOp->getOperand(0); 4442 SDValue V2 = SVOp->getOperand(1); 4443 DebugLoc dl = SVOp->getDebugLoc(); 4444 SmallVector<int, 8> MaskVals; 4445 4446 // Determine if more than 1 of the words in each of the low and high quadwords 4447 // of the result come from the same quadword of one of the two inputs. Undef 4448 // mask values count as coming from any quadword, for better codegen. 4449 SmallVector<unsigned, 4> LoQuad(4); 4450 SmallVector<unsigned, 4> HiQuad(4); 4451 BitVector InputQuads(4); 4452 for (unsigned i = 0; i < 8; ++i) { 4453 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4454 int EltIdx = SVOp->getMaskElt(i); 4455 MaskVals.push_back(EltIdx); 4456 if (EltIdx < 0) { 4457 ++Quad[0]; 4458 ++Quad[1]; 4459 ++Quad[2]; 4460 ++Quad[3]; 4461 continue; 4462 } 4463 ++Quad[EltIdx / 4]; 4464 InputQuads.set(EltIdx / 4); 4465 } 4466 4467 int BestLoQuad = -1; 4468 unsigned MaxQuad = 1; 4469 for (unsigned i = 0; i < 4; ++i) { 4470 if (LoQuad[i] > MaxQuad) { 4471 BestLoQuad = i; 4472 MaxQuad = LoQuad[i]; 4473 } 4474 } 4475 4476 int BestHiQuad = -1; 4477 MaxQuad = 1; 4478 for (unsigned i = 0; i < 4; ++i) { 4479 if (HiQuad[i] > MaxQuad) { 4480 BestHiQuad = i; 4481 MaxQuad = HiQuad[i]; 4482 } 4483 } 4484 4485 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 4486 // of the two input vectors, shuffle them into one input vector so only a 4487 // single pshufb instruction is necessary. If There are more than 2 input 4488 // quads, disable the next transformation since it does not help SSSE3. 4489 bool V1Used = InputQuads[0] || InputQuads[1]; 4490 bool V2Used = InputQuads[2] || InputQuads[3]; 4491 if (Subtarget->hasSSSE3()) { 4492 if (InputQuads.count() == 2 && V1Used && V2Used) { 4493 BestLoQuad = InputQuads.find_first(); 4494 BestHiQuad = InputQuads.find_next(BestLoQuad); 4495 } 4496 if (InputQuads.count() > 2) { 4497 BestLoQuad = -1; 4498 BestHiQuad = -1; 4499 } 4500 } 4501 4502 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4503 // the shuffle mask. 
If a quad is scored as -1, that means that it contains 4504 // words from all 4 input quadwords. 4505 SDValue NewV; 4506 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4507 SmallVector<int, 8> MaskV; 4508 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4509 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4510 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4511 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 4512 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 4513 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 4514 4515 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4516 // source words for the shuffle, to aid later transformations. 4517 bool AllWordsInNewV = true; 4518 bool InOrder[2] = { true, true }; 4519 for (unsigned i = 0; i != 8; ++i) { 4520 int idx = MaskVals[i]; 4521 if (idx != (int)i) 4522 InOrder[i/4] = false; 4523 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4524 continue; 4525 AllWordsInNewV = false; 4526 break; 4527 } 4528 4529 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4530 if (AllWordsInNewV) { 4531 for (int i = 0; i != 8; ++i) { 4532 int idx = MaskVals[i]; 4533 if (idx < 0) 4534 continue; 4535 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4536 if ((idx != i) && idx < 4) 4537 pshufhw = false; 4538 if ((idx != i) && idx > 3) 4539 pshuflw = false; 4540 } 4541 V1 = NewV; 4542 V2Used = false; 4543 BestLoQuad = 0; 4544 BestHiQuad = 1; 4545 } 4546 4547 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4548 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4549 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4550 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 4551 unsigned TargetMask = 0; 4552 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4553 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4554 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 4555 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 4556 V1 = NewV.getOperand(0); 4557 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 4558 } 4559 } 4560 4561 // If we have SSSE3, and all words of the result are from 1 input vector, 4562 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4563 // is present, fall back to case 4. 4564 if (Subtarget->hasSSSE3()) { 4565 SmallVector<SDValue,16> pshufbMask; 4566 4567 // If we have elements from both input vectors, set the high bit of the 4568 // shuffle mask element to zero out elements that come from V2 in the V1 4569 // mask, and elements that come from V1 in the V2 mask, so that the two 4570 // results can be OR'd together. 4571 bool TwoInputs = V1Used && V2Used; 4572 for (unsigned i = 0; i != 8; ++i) { 4573 int EltIdx = MaskVals[i] * 2; 4574 if (TwoInputs && (EltIdx >= 16)) { 4575 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4576 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4577 continue; 4578 } 4579 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4580 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4581 } 4582 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 4583 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4584 DAG.getNode(ISD::BUILD_VECTOR, dl, 4585 MVT::v16i8, &pshufbMask[0], 16)); 4586 if (!TwoInputs) 4587 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4588 4589 // Calculate the shuffle mask for the second input, shuffle it, and 4590 // OR it with the first shuffled input. 
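// e.g. for MaskVals = <0, 9, 2, 11, 4, 13, 6, 15> (even result words from
// V1, odd result words from V2), the pshufb byte mask built above for V1 is
//   < 0, 1, 0x80,0x80,  4, 5, 0x80,0x80,  8, 9, 0x80,0x80, 12,13, 0x80,0x80>
// and the one built below for V2 is
//   <0x80,0x80,  2, 3, 0x80,0x80,  6, 7, 0x80,0x80, 10,11, 0x80,0x80, 14,15>
// (0x80 zeroes the byte), so the OR of the two shuffled vectors gives the
// final result.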
4591 pshufbMask.clear(); 4592 for (unsigned i = 0; i != 8; ++i) { 4593 int EltIdx = MaskVals[i] * 2; 4594 if (EltIdx < 16) { 4595 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4596 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4597 continue; 4598 } 4599 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4600 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4601 } 4602 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 4603 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4604 DAG.getNode(ISD::BUILD_VECTOR, dl, 4605 MVT::v16i8, &pshufbMask[0], 16)); 4606 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4607 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4608 } 4609 4610 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4611 // and update MaskVals with new element order. 4612 BitVector InOrder(8); 4613 if (BestLoQuad >= 0) { 4614 SmallVector<int, 8> MaskV; 4615 for (int i = 0; i != 4; ++i) { 4616 int idx = MaskVals[i]; 4617 if (idx < 0) { 4618 MaskV.push_back(-1); 4619 InOrder.set(i); 4620 } else if ((idx / 4) == BestLoQuad) { 4621 MaskV.push_back(idx & 3); 4622 InOrder.set(i); 4623 } else { 4624 MaskV.push_back(-1); 4625 } 4626 } 4627 for (unsigned i = 4; i != 8; ++i) 4628 MaskV.push_back(i); 4629 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4630 &MaskV[0]); 4631 4632 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4633 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 4634 NewV.getOperand(0), 4635 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 4636 DAG); 4637 } 4638 4639 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4640 // and update MaskVals with the new element order. 4641 if (BestHiQuad >= 0) { 4642 SmallVector<int, 8> MaskV; 4643 for (unsigned i = 0; i != 4; ++i) 4644 MaskV.push_back(i); 4645 for (unsigned i = 4; i != 8; ++i) { 4646 int idx = MaskVals[i]; 4647 if (idx < 0) { 4648 MaskV.push_back(-1); 4649 InOrder.set(i); 4650 } else if ((idx / 4) == BestHiQuad) { 4651 MaskV.push_back((idx & 3) + 4); 4652 InOrder.set(i); 4653 } else { 4654 MaskV.push_back(-1); 4655 } 4656 } 4657 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4658 &MaskV[0]); 4659 4660 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4661 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 4662 NewV.getOperand(0), 4663 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 4664 DAG); 4665 } 4666 4667 // In case BestHi & BestLo were both -1, which means each quadword has a word 4668 // from each of the four input quadwords, calculate the InOrder bitvector now 4669 // before falling through to the insert/extract cleanup. 4670 if (BestLoQuad == -1 && BestHiQuad == -1) { 4671 NewV = V1; 4672 for (int i = 0; i != 8; ++i) 4673 if (MaskVals[i] < 0 || MaskVals[i] == i) 4674 InOrder.set(i); 4675 } 4676 4677 // The other elements are put in the right place using pextrw and pinsrw. 4678 for (unsigned i = 0; i != 8; ++i) { 4679 if (InOrder[i]) 4680 continue; 4681 int EltIdx = MaskVals[i]; 4682 if (EltIdx < 0) 4683 continue; 4684 SDValue ExtOp = (EltIdx < 8) 4685 ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4686 DAG.getIntPtrConstant(EltIdx)) 4687 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4688 DAG.getIntPtrConstant(EltIdx - 8)); 4689 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4690 DAG.getIntPtrConstant(i)); 4691 } 4692 return NewV; 4693} 4694 4695// v16i8 shuffles - Prefer shuffles in the following order: 4696// 1. [ssse3] 1 x pshufb 4697// 2. [ssse3] 2 x pshufb + 1 x por 4698// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4699static 4700SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4701 SelectionDAG &DAG, 4702 const X86TargetLowering &TLI) { 4703 SDValue V1 = SVOp->getOperand(0); 4704 SDValue V2 = SVOp->getOperand(1); 4705 DebugLoc dl = SVOp->getDebugLoc(); 4706 SmallVector<int, 16> MaskVals; 4707 SVOp->getMask(MaskVals); 4708 4709 // If we have SSSE3, case 1 is generated when all result bytes come from 4710 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4711 // present, fall back to case 3. 4712 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 4713 bool V1Only = true; 4714 bool V2Only = true; 4715 for (unsigned i = 0; i < 16; ++i) { 4716 int EltIdx = MaskVals[i]; 4717 if (EltIdx < 0) 4718 continue; 4719 if (EltIdx < 16) 4720 V2Only = false; 4721 else 4722 V1Only = false; 4723 } 4724 4725 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4726 if (TLI.getSubtarget()->hasSSSE3()) { 4727 SmallVector<SDValue,16> pshufbMask; 4728 4729 // If all result elements are from one input vector, then only translate 4730 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4731 // 4732 // Otherwise, we have elements from both input vectors, and must zero out 4733 // elements that come from V2 in the first mask, and V1 in the second mask 4734 // so that we can OR them together. 4735 bool TwoInputs = !(V1Only || V2Only); 4736 for (unsigned i = 0; i != 16; ++i) { 4737 int EltIdx = MaskVals[i]; 4738 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4739 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4740 continue; 4741 } 4742 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4743 } 4744 // If all the elements are from V2, assign it to V1 and return after 4745 // building the first pshufb. 4746 if (V2Only) 4747 V1 = V2; 4748 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4749 DAG.getNode(ISD::BUILD_VECTOR, dl, 4750 MVT::v16i8, &pshufbMask[0], 16)); 4751 if (!TwoInputs) 4752 return V1; 4753 4754 // Calculate the shuffle mask for the second input, shuffle it, and 4755 // OR it with the first shuffled input. 4756 pshufbMask.clear(); 4757 for (unsigned i = 0; i != 16; ++i) { 4758 int EltIdx = MaskVals[i]; 4759 if (EltIdx < 16) { 4760 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4761 continue; 4762 } 4763 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4764 } 4765 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4766 DAG.getNode(ISD::BUILD_VECTOR, dl, 4767 MVT::v16i8, &pshufbMask[0], 16)); 4768 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4769 } 4770 4771 // No SSSE3 - Calculate in place words and then fix all out of place words 4772 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4773 // the 16 different words that comprise the two doublequadword input vectors. 4774 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 4775 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 4776 SDValue NewV = V2Only ? 
V2 : V1; 4777 for (int i = 0; i != 8; ++i) { 4778 int Elt0 = MaskVals[i*2]; 4779 int Elt1 = MaskVals[i*2+1]; 4780 4781 // This word of the result is all undef, skip it. 4782 if (Elt0 < 0 && Elt1 < 0) 4783 continue; 4784 4785 // This word of the result is already in the correct place, skip it. 4786 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4787 continue; 4788 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4789 continue; 4790 4791 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4792 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4793 SDValue InsElt; 4794 4795 // If Elt0 and Elt1 are defined, are consecutive, and can be load 4796 // using a single extract together, load it and store it. 4797 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4798 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4799 DAG.getIntPtrConstant(Elt1 / 2)); 4800 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4801 DAG.getIntPtrConstant(i)); 4802 continue; 4803 } 4804 4805 // If Elt1 is defined, extract it from the appropriate source. If the 4806 // source byte is not also odd, shift the extracted word left 8 bits 4807 // otherwise clear the bottom 8 bits if we need to do an or. 4808 if (Elt1 >= 0) { 4809 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4810 DAG.getIntPtrConstant(Elt1 / 2)); 4811 if ((Elt1 & 1) == 0) 4812 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4813 DAG.getConstant(8, TLI.getShiftAmountTy())); 4814 else if (Elt0 >= 0) 4815 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4816 DAG.getConstant(0xFF00, MVT::i16)); 4817 } 4818 // If Elt0 is defined, extract it from the appropriate source. If the 4819 // source byte is not also even, shift the extracted word right 8 bits. If 4820 // Elt1 was also defined, OR the extracted values together before 4821 // inserting them in the result. 4822 if (Elt0 >= 0) { 4823 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4824 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4825 if ((Elt0 & 1) != 0) 4826 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4827 DAG.getConstant(8, TLI.getShiftAmountTy())); 4828 else if (Elt1 >= 0) 4829 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4830 DAG.getConstant(0x00FF, MVT::i16)); 4831 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4832 : InsElt0; 4833 } 4834 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4835 DAG.getIntPtrConstant(i)); 4836 } 4837 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 4838} 4839 4840/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4841/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 4842/// done when every pair / quad of shuffle mask elements point to elements in 4843/// the right sequence. e.g. 4844/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 4845static 4846SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4847 SelectionDAG &DAG, DebugLoc dl) { 4848 EVT VT = SVOp->getValueType(0); 4849 SDValue V1 = SVOp->getOperand(0); 4850 SDValue V2 = SVOp->getOperand(1); 4851 unsigned NumElems = VT.getVectorNumElements(); 4852 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 4853 EVT NewVT; 4854 switch (VT.getSimpleVT().SimpleTy) { 4855 default: assert(false && "Unexpected!"); 4856 case MVT::v4f32: NewVT = MVT::v2f64; break; 4857 case MVT::v4i32: NewVT = MVT::v2i64; break; 4858 case MVT::v8i16: NewVT = MVT::v4i32; break; 4859 case MVT::v16i8: NewVT = MVT::v4i32; break; 4860 } 4861 4862 int Scale = NumElems / NewWidth; 4863 SmallVector<int, 8> MaskVec; 4864 for (unsigned i = 0; i < NumElems; i += Scale) { 4865 int StartIdx = -1; 4866 for (int j = 0; j < Scale; ++j) { 4867 int EltIdx = SVOp->getMaskElt(i+j); 4868 if (EltIdx < 0) 4869 continue; 4870 if (StartIdx == -1) 4871 StartIdx = EltIdx - (EltIdx % Scale); 4872 if (EltIdx != StartIdx + j) 4873 return SDValue(); 4874 } 4875 if (StartIdx == -1) 4876 MaskVec.push_back(-1); 4877 else 4878 MaskVec.push_back(StartIdx / Scale); 4879 } 4880 4881 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 4882 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 4883 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4884} 4885 4886/// getVZextMovL - Return a zero-extending vector move low node. 4887/// 4888static SDValue getVZextMovL(EVT VT, EVT OpVT, 4889 SDValue SrcOp, SelectionDAG &DAG, 4890 const X86Subtarget *Subtarget, DebugLoc dl) { 4891 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4892 LoadSDNode *LD = NULL; 4893 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4894 LD = dyn_cast<LoadSDNode>(SrcOp); 4895 if (!LD) { 4896 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4897 // instead. 4898 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4899 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 4900 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4901 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 4902 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4903 // PR2108 4904 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4905 return DAG.getNode(ISD::BITCAST, dl, VT, 4906 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4907 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4908 OpVT, 4909 SrcOp.getOperand(0) 4910 .getOperand(0)))); 4911 } 4912 } 4913 } 4914 4915 return DAG.getNode(ISD::BITCAST, dl, VT, 4916 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4917 DAG.getNode(ISD::BITCAST, dl, 4918 OpVT, SrcOp))); 4919} 4920 4921/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4922/// shuffles. 4923static SDValue 4924LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4925 SDValue V1 = SVOp->getOperand(0); 4926 SDValue V2 = SVOp->getOperand(1); 4927 DebugLoc dl = SVOp->getDebugLoc(); 4928 EVT VT = SVOp->getValueType(0); 4929 4930 SmallVector<std::pair<int, int>, 8> Locs; 4931 Locs.resize(4); 4932 SmallVector<int, 8> Mask1(4U, -1); 4933 SmallVector<int, 8> PermMask; 4934 SVOp->getMask(PermMask); 4935 4936 unsigned NumHi = 0; 4937 unsigned NumLo = 0; 4938 for (unsigned i = 0; i != 4; ++i) { 4939 int Idx = PermMask[i]; 4940 if (Idx < 0) { 4941 Locs[i] = std::make_pair(-1, -1); 4942 } else { 4943 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4944 if (Idx < 4) { 4945 Locs[i] = std::make_pair(0, NumLo); 4946 Mask1[NumLo] = Idx; 4947 NumLo++; 4948 } else { 4949 Locs[i] = std::make_pair(1, NumHi); 4950 if (2+NumHi < 4) 4951 Mask1[2+NumHi] = Idx; 4952 NumHi++; 4953 } 4954 } 4955 } 4956 4957 if (NumLo <= 2 && NumHi <= 2) { 4958 // If no more than two elements come from either vector. This can be 4959 // implemented with two shuffles. First shuffle gather the elements. 
4960 // The second shuffle, which takes the first shuffle as both of its 4961 // vector operands, put the elements into the right order. 4962 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4963 4964 SmallVector<int, 8> Mask2(4U, -1); 4965 4966 for (unsigned i = 0; i != 4; ++i) { 4967 if (Locs[i].first == -1) 4968 continue; 4969 else { 4970 unsigned Idx = (i < 2) ? 0 : 4; 4971 Idx += Locs[i].first * 2 + Locs[i].second; 4972 Mask2[i] = Idx; 4973 } 4974 } 4975 4976 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4977 } else if (NumLo == 3 || NumHi == 3) { 4978 // Otherwise, we must have three elements from one vector, call it X, and 4979 // one element from the other, call it Y. First, use a shufps to build an 4980 // intermediate vector with the one element from Y and the element from X 4981 // that will be in the same half in the final destination (the indexes don't 4982 // matter). Then, use a shufps to build the final vector, taking the half 4983 // containing the element from Y from the intermediate, and the other half 4984 // from X. 4985 if (NumHi == 3) { 4986 // Normalize it so the 3 elements come from V1. 4987 CommuteVectorShuffleMask(PermMask, VT); 4988 std::swap(V1, V2); 4989 } 4990 4991 // Find the element from V2. 4992 unsigned HiIndex; 4993 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4994 int Val = PermMask[HiIndex]; 4995 if (Val < 0) 4996 continue; 4997 if (Val >= 4) 4998 break; 4999 } 5000 5001 Mask1[0] = PermMask[HiIndex]; 5002 Mask1[1] = -1; 5003 Mask1[2] = PermMask[HiIndex^1]; 5004 Mask1[3] = -1; 5005 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5006 5007 if (HiIndex >= 2) { 5008 Mask1[0] = PermMask[0]; 5009 Mask1[1] = PermMask[1]; 5010 Mask1[2] = HiIndex & 1 ? 6 : 4; 5011 Mask1[3] = HiIndex & 1 ? 4 : 6; 5012 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5013 } else { 5014 Mask1[0] = HiIndex & 1 ? 2 : 0; 5015 Mask1[1] = HiIndex & 1 ? 0 : 2; 5016 Mask1[2] = PermMask[2]; 5017 Mask1[3] = PermMask[3]; 5018 if (Mask1[2] >= 0) 5019 Mask1[2] += 4; 5020 if (Mask1[3] >= 0) 5021 Mask1[3] += 4; 5022 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 5023 } 5024 } 5025 5026 // Break it into (shuffle shuffle_hi, shuffle_lo). 
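// e.g. for PermMask = <2, 0, 3, 1> (all four defined elements come from one
// vector, so neither case above applies):
//   LoMask = <2, 0, u, u>  -->  LoShuffle = shuffle(V1, V2, LoMask)
//   HiMask = <3, 1, u, u>  -->  HiShuffle = shuffle(V1, V2, HiMask)
// and the final mask built below is <0, 1, 4, 5>, i.e. the low halves of
// LoShuffle and HiShuffle concatenated.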
5027 Locs.clear(); 5028 SmallVector<int,8> LoMask(4U, -1); 5029 SmallVector<int,8> HiMask(4U, -1); 5030 5031 SmallVector<int,8> *MaskPtr = &LoMask; 5032 unsigned MaskIdx = 0; 5033 unsigned LoIdx = 0; 5034 unsigned HiIdx = 2; 5035 for (unsigned i = 0; i != 4; ++i) { 5036 if (i == 2) { 5037 MaskPtr = &HiMask; 5038 MaskIdx = 1; 5039 LoIdx = 0; 5040 HiIdx = 2; 5041 } 5042 int Idx = PermMask[i]; 5043 if (Idx < 0) { 5044 Locs[i] = std::make_pair(-1, -1); 5045 } else if (Idx < 4) { 5046 Locs[i] = std::make_pair(MaskIdx, LoIdx); 5047 (*MaskPtr)[LoIdx] = Idx; 5048 LoIdx++; 5049 } else { 5050 Locs[i] = std::make_pair(MaskIdx, HiIdx); 5051 (*MaskPtr)[HiIdx] = Idx; 5052 HiIdx++; 5053 } 5054 } 5055 5056 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 5057 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 5058 SmallVector<int, 8> MaskOps; 5059 for (unsigned i = 0; i != 4; ++i) { 5060 if (Locs[i].first == -1) { 5061 MaskOps.push_back(-1); 5062 } else { 5063 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 5064 MaskOps.push_back(Idx); 5065 } 5066 } 5067 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 5068} 5069 5070static bool MayFoldVectorLoad(SDValue V) { 5071 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5072 V = V.getOperand(0); 5073 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5074 V = V.getOperand(0); 5075 if (MayFoldLoad(V)) 5076 return true; 5077 return false; 5078} 5079 5080// FIXME: the version above should always be used. Since there's 5081// a bug where several vector shuffles can't be folded because the 5082// DAG is not updated during lowering and a node claims to have two 5083// uses while it only has one, use this version, and let isel match 5084// another instruction if the load really happens to have more than 5085// one use. Remove this version after this bug get fixed. 5086// rdar://8434668, PR8156 5087static bool RelaxedMayFoldVectorLoad(SDValue V) { 5088 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5089 V = V.getOperand(0); 5090 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5091 V = V.getOperand(0); 5092 if (ISD::isNormalLoad(V.getNode())) 5093 return true; 5094 return false; 5095} 5096 5097/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by 5098/// a vector extract, and if both can be later optimized into a single load. 5099/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 5100/// here because otherwise a target specific shuffle node is going to be 5101/// emitted for this shuffle, and the optimization not done. 5102/// FIXME: This is probably not the best approach, but fix the problem 5103/// until the right path is decided. 5104static 5105bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 5106 const TargetLowering &TLI) { 5107 EVT VT = V.getValueType(); 5108 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 5109 5110 // Be sure that the vector shuffle is present in a pattern like this: 5111 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 5112 if (!V.hasOneUse()) 5113 return false; 5114 5115 SDNode *N = *V.getNode()->use_begin(); 5116 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 5117 return false; 5118 5119 SDValue EltNo = N->getOperand(1); 5120 if (!isa<ConstantSDNode>(EltNo)) 5121 return false; 5122 5123 // If the bit convert changed the number of elements, it is unsafe 5124 // to examine the mask. 
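// (The constant extract index below is checked against the shuffle mask,
// which is expressed in terms of the pre-bitcast element count; if the
// bitcast changed that count the index would not name a valid mask element,
// so give up in that case.)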
5125 bool HasShuffleIntoBitcast = false; 5126 if (V.getOpcode() == ISD::BITCAST) { 5127 EVT SrcVT = V.getOperand(0).getValueType(); 5128 if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) 5129 return false; 5130 V = V.getOperand(0); 5131 HasShuffleIntoBitcast = true; 5132 } 5133 5134 // Select the input vector, guarding against an out of range extract index. 5135 unsigned NumElems = VT.getVectorNumElements(); 5136 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 5137 int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); 5138 V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); 5139 5140 // Skip one more bit_convert if necessary. 5141 if (V.getOpcode() == ISD::BITCAST) 5142 V = V.getOperand(0); 5143 5144 if (ISD::isNormalLoad(V.getNode())) { 5145 // Is the original load suitable? 5146 LoadSDNode *LN0 = cast<LoadSDNode>(V); 5147 5148 // FIXME: avoid the multi-use bug that is preventing lots of 5149 // foldings from being detected. This is still wrong of course, but 5150 // it gives the desired behavior for now, and if it happens that 5151 // the load really has more uses, it will not fold during isel and 5152 // will generate poor code. 5153 if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() 5154 return false; 5155 5156 if (!HasShuffleIntoBitcast) 5157 return true; 5158 5159 // If there's a bitcast before the shuffle, check that the load type and 5160 // alignment are valid. 5161 unsigned Align = LN0->getAlignment(); 5162 unsigned NewAlign = 5163 TLI.getTargetData()->getABITypeAlignment( 5164 VT.getTypeForEVT(*DAG.getContext())); 5165 5166 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 5167 return false; 5168 } 5169 5170 return true; 5171} 5172 5173static 5174SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { 5175 EVT VT = Op.getValueType(); 5176 5177 // Canonicalize to v2f64. 5178 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 5179 return DAG.getNode(ISD::BITCAST, dl, VT, 5180 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 5181 V1, DAG)); 5182} 5183 5184static 5185SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, 5186 bool HasSSE2) { 5187 SDValue V1 = Op.getOperand(0); 5188 SDValue V2 = Op.getOperand(1); 5189 EVT VT = Op.getValueType(); 5190 5191 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 5192 5193 if (HasSSE2 && VT == MVT::v2f64) 5194 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 5195 5196 // v4f32 or v4i32 5197 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG); 5198} 5199 5200static 5201SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { 5202 SDValue V1 = Op.getOperand(0); 5203 SDValue V2 = Op.getOperand(1); 5204 EVT VT = Op.getValueType(); 5205 5206 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 5207 "unsupported shuffle type"); 5208 5209 if (V2.getOpcode() == ISD::UNDEF) 5210 V2 = V1; 5211 5212 // v4i32 or v4f32 5213 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 5214} 5215 5216static 5217SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 5218 SDValue V1 = Op.getOperand(0); 5219 SDValue V2 = Op.getOperand(1); 5220 EVT VT = Op.getValueType(); 5221 unsigned NumElems = VT.getVectorNumElements(); 5222 5223 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 5224 // operand of these instructions is only memory, so check if there's a 5225 // potential load folding here; otherwise use SHUFPS or MOVSD to match the 5226 // same masks.
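// In the simplest case the load feeding V2 is folded directly into the
// instruction during isel, e.g.:
//   (v4f32 (X86Movlps VR128:$src1, (load addr:$src2)))
// becomes something like:
//   (MOVLPSrm VR128:$src1, addr:$src2)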
5227 bool CanFoldLoad = false; 5228 5229 // Trivial case, when V2 comes from a load. 5230 if (MayFoldVectorLoad(V2)) 5231 CanFoldLoad = true; 5232 5233 // When V1 is a load, it can be folded later into a store in isel, example: 5234 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 5235 // turns into: 5236 // (MOVLPSmr addr:$src1, VR128:$src2) 5237 // So, recognize this potential fold and also use MOVLPS or MOVLPD. 5238 if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 5239 CanFoldLoad = true; 5240 5241 if (CanFoldLoad) { 5242 if (HasSSE2 && NumElems == 2) 5243 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 5244 5245 if (NumElems == 4) 5246 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 5247 } 5248 5249 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5250 // movl and movlp will both match v2i64, but v2i64 is never matched by 5251 // movl earlier because we make it strict to avoid messing with the movlp load 5252 // folding logic (see the code above the getMOVLP call). Match it here then; 5253 // this is horrible, but it will stay like this until we move all shuffle 5254 // matching to x86 specific nodes. Note that for the 1st condition all 5255 // types are matched with movsd. 5256 if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) 5257 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5258 else if (HasSSE2) 5259 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5260 5261 5262 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 5263 5264 // Invert the operand order and use SHUFPS to match it. 5265 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, 5266 X86::getShuffleSHUFImmediate(SVOp), DAG); 5267} 5268 5269static inline unsigned getUNPCKLOpcode(EVT VT) { 5270 switch(VT.getSimpleVT().SimpleTy) { 5271 case MVT::v4i32: return X86ISD::PUNPCKLDQ; 5272 case MVT::v2i64: return X86ISD::PUNPCKLQDQ; 5273 case MVT::v4f32: return X86ISD::UNPCKLPS; 5274 case MVT::v2f64: return X86ISD::UNPCKLPD; 5275 case MVT::v16i8: return X86ISD::PUNPCKLBW; 5276 case MVT::v8i16: return X86ISD::PUNPCKLWD; 5277 default: 5278 llvm_unreachable("Unknown type for unpckl"); 5279 } 5280 return 0; 5281} 5282 5283static inline unsigned getUNPCKHOpcode(EVT VT) { 5284 switch(VT.getSimpleVT().SimpleTy) { 5285 case MVT::v4i32: return X86ISD::PUNPCKHDQ; 5286 case MVT::v2i64: return X86ISD::PUNPCKHQDQ; 5287 case MVT::v4f32: return X86ISD::UNPCKHPS; 5288 case MVT::v2f64: return X86ISD::UNPCKHPD; 5289 case MVT::v16i8: return X86ISD::PUNPCKHBW; 5290 case MVT::v8i16: return X86ISD::PUNPCKHWD; 5291 default: 5292 llvm_unreachable("Unknown type for unpckh"); 5293 } 5294 return 0; 5295} 5296 5297static 5298SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, 5299 const TargetLowering &TLI, 5300 const X86Subtarget *Subtarget) { 5301 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5302 EVT VT = Op.getValueType(); 5303 DebugLoc dl = Op.getDebugLoc(); 5304 SDValue V1 = Op.getOperand(0); 5305 SDValue V2 = Op.getOperand(1); 5306 5307 if (isZeroShuffle(SVOp)) 5308 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 5309 5310 // Handle splat operations. 5311 if (SVOp->isSplat()) { 5312 // Special case: this is the only place now where it's 5313 // allowed to return a vector_shuffle operation without 5314 // using a target specific node, because *hopefully* it 5315 // will be optimized away by the dag combiner.
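// Concretely, if the only use of this splat is an EXTRACT_VECTOR_ELT of a
// loaded vector, DAGCombiner's visitEXTRACT_VECTOR_ELT can fold the whole
// (extract (shuffle (load))) chain into a single scalar load; emitting a
// target shuffle node here would hide that opportunity (see
// CanXFormVExtractWithShuffleIntoLoad above).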
5316 if (VT.getVectorNumElements() <= 4 && 5317 CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) 5318 return Op; 5319 5320 // Handle splats by matching through known masks 5321 if (VT.getVectorNumElements() <= 4) 5322 return SDValue(); 5323 5324 // Canonicalize all of the remaining to v4f32. 5325 return PromoteSplat(SVOp, DAG); 5326 } 5327 5328 // If the shuffle can be profitably rewritten as a narrower shuffle, then 5329 // do it! 5330 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 5331 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5332 if (NewOp.getNode()) 5333 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 5334 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 5335 // FIXME: Figure out a cleaner way to do this. 5336 // Try to make use of movq to zero out the top part. 5337 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 5338 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5339 if (NewOp.getNode()) { 5340 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 5341 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 5342 DAG, Subtarget, dl); 5343 } 5344 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 5345 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5346 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 5347 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 5348 DAG, Subtarget, dl); 5349 } 5350 } 5351 return SDValue(); 5352} 5353 5354SDValue 5355X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 5356 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5357 SDValue V1 = Op.getOperand(0); 5358 SDValue V2 = Op.getOperand(1); 5359 EVT VT = Op.getValueType(); 5360 DebugLoc dl = Op.getDebugLoc(); 5361 unsigned NumElems = VT.getVectorNumElements(); 5362 bool isMMX = VT.getSizeInBits() == 64; 5363 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 5364 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 5365 bool V1IsSplat = false; 5366 bool V2IsSplat = false; 5367 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); 5368 bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); 5369 bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX(); 5370 MachineFunction &MF = DAG.getMachineFunction(); 5371 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 5372 5373 // Shuffle operations on MMX not supported. 5374 if (isMMX) 5375 return Op; 5376 5377 // Vector shuffle lowering takes 3 steps: 5378 // 5379 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 5380 // narrowing and commutation of operands should be handled. 5381 // 2) Matching of shuffles with known shuffle masks to x86 target specific 5382 // shuffle nodes. 5383 // 3) Rewriting of unmatched masks into new generic shuffle operations, 5384 // so the shuffle can be broken into other shuffles and the legalizer can 5385 // try the lowering again. 5386 // 5387 // The general ideia is that no vector_shuffle operation should be left to 5388 // be matched during isel, all of them must be converted to a target specific 5389 // node here. 5390 5391 // Normalize the input vectors. Here splats, zeroed vectors, profitable 5392 // narrowing and commutation of operands should be handled. The actual code 5393 // doesn't include all of those, work in progress... 
5394 SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); 5395 if (NewOp.getNode()) 5396 return NewOp; 5397 5398 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 5399 // unpckh_undef). Only use pshufd if speed is more important than size. 5400 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) 5401 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5402 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 5403 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) 5404 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5405 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5406 5407 if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && 5408 RelaxedMayFoldVectorLoad(V1)) 5409 return getMOVDDup(Op, dl, V1, DAG); 5410 5411 if (X86::isMOVHLPS_v_undef_Mask(SVOp)) 5412 return getMOVHighToLow(Op, dl, DAG); 5413 5414 // Use to match splats 5415 if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && 5416 (VT == MVT::v2f64 || VT == MVT::v2i64)) 5417 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5418 5419 if (X86::isPSHUFDMask(SVOp)) { 5420 // The actual implementation will match the mask in the if above and then 5421 // during isel it can match several different instructions, not only pshufd 5422 // as its name says, sad but true, emulate the behavior for now... 5423 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 5424 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 5425 5426 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5427 5428 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 5429 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 5430 5431 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5432 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 5433 TargetMask, DAG); 5434 5435 if (VT == MVT::v4f32) 5436 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 5437 TargetMask, DAG); 5438 } 5439 5440 // Check if this can be converted into a logical shift. 5441 bool isLeft = false; 5442 unsigned ShAmt = 0; 5443 SDValue ShVal; 5444 bool isShift = getSubtarget()->hasSSE2() && 5445 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 5446 if (isShift && ShVal.hasOneUse()) { 5447 // If the shifted value has multiple uses, it may be cheaper to use 5448 // v_set0 + movlhps or movhlps, etc. 5449 EVT EltVT = VT.getVectorElementType(); 5450 ShAmt *= EltVT.getSizeInBits(); 5451 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5452 } 5453 5454 if (X86::isMOVLMask(SVOp)) { 5455 if (V1IsUndef) 5456 return V2; 5457 if (ISD::isBuildVectorAllZeros(V1.getNode())) 5458 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 5459 if (!X86::isMOVLPMask(SVOp)) { 5460 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5461 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5462 5463 if (VT == MVT::v4i32 || VT == MVT::v4f32) 5464 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5465 } 5466 } 5467 5468 // FIXME: fold these into legal mask. 
5469 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 5470 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 5471 5472 if (X86::isMOVHLPSMask(SVOp)) 5473 return getMOVHighToLow(Op, dl, DAG); 5474 5475 if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5476 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 5477 5478 if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5479 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 5480 5481 if (X86::isMOVLPMask(SVOp)) 5482 return getMOVLP(Op, dl, DAG, HasSSE2); 5483 5484 if (ShouldXformToMOVHLPS(SVOp) || 5485 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 5486 return CommuteVectorShuffle(SVOp, DAG); 5487 5488 if (isShift) { 5489 // No better options. Use a vshl / vsrl. 5490 EVT EltVT = VT.getVectorElementType(); 5491 ShAmt *= EltVT.getSizeInBits(); 5492 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5493 } 5494 5495 bool Commuted = false; 5496 // FIXME: This should also accept a bitcast of a splat? Be careful, not 5497 // 1,1,1,1 -> v8i16 though. 5498 V1IsSplat = isSplatVector(V1.getNode()); 5499 V2IsSplat = isSplatVector(V2.getNode()); 5500 5501 // Canonicalize the splat or undef, if present, to be on the RHS. 5502 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 5503 Op = CommuteVectorShuffle(SVOp, DAG); 5504 SVOp = cast<ShuffleVectorSDNode>(Op); 5505 V1 = SVOp->getOperand(0); 5506 V2 = SVOp->getOperand(1); 5507 std::swap(V1IsSplat, V2IsSplat); 5508 std::swap(V1IsUndef, V2IsUndef); 5509 Commuted = true; 5510 } 5511 5512 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 5513 // Shuffling low element of v1 into undef, just return v1. 5514 if (V2IsUndef) 5515 return V1; 5516 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 5517 // the instruction selector will not match, so get a canonical MOVL with 5518 // swapped operands to undo the commute. 5519 return getMOVL(DAG, dl, VT, V2, V1); 5520 } 5521 5522 if (X86::isUNPCKLMask(SVOp)) 5523 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); 5524 5525 if (X86::isUNPCKHMask(SVOp)) 5526 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); 5527 5528 if (V2IsSplat) { 5529 // Normalize mask so all entries that point to V2 points to its first 5530 // element then try to match unpck{h|l} again. If match, return a 5531 // new vector_shuffle with the corrected mask. 5532 SDValue NewMask = NormalizeMask(SVOp, DAG); 5533 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 5534 if (NSVOp != SVOp) { 5535 if (X86::isUNPCKLMask(NSVOp, true)) { 5536 return NewMask; 5537 } else if (X86::isUNPCKHMask(NSVOp, true)) { 5538 return NewMask; 5539 } 5540 } 5541 } 5542 5543 if (Commuted) { 5544 // Commute is back and try unpck* again. 5545 // FIXME: this seems wrong. 
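// The operands were swapped above to canonicalize the splat/undef onto the
// RHS, so commute the shuffle back and re-test the unpck predicates; on a
// match, the target node is built with the operands in their swapped order
// (V2, V1).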
5546 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 5547 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 5548 5549 if (X86::isUNPCKLMask(NewSVOp)) 5550 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); 5551 5552 if (X86::isUNPCKHMask(NewSVOp)) 5553 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); 5554 } 5555 5556 // Normalize the node to match x86 shuffle ops if needed 5557 if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 5558 return CommuteVectorShuffle(SVOp, DAG); 5559 5560 // The checks below are all present in isShuffleMaskLegal, but they are 5561 // inlined here right now to enable us to directly emit target specific 5562 // nodes, and remove one by one until they don't return Op anymore. 5563 SmallVector<int, 16> M; 5564 SVOp->getMask(M); 5565 5566 if (isPALIGNRMask(M, VT, HasSSSE3)) 5567 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 5568 X86::getShufflePALIGNRImmediate(SVOp), 5569 DAG); 5570 5571 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 5572 SVOp->getSplatIndex() == 0 && V2IsUndef) { 5573 if (VT == MVT::v2f64) 5574 return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); 5575 if (VT == MVT::v2i64) 5576 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 5577 } 5578 5579 if (isPSHUFHWMask(M, VT)) 5580 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 5581 X86::getShufflePSHUFHWImmediate(SVOp), 5582 DAG); 5583 5584 if (isPSHUFLWMask(M, VT)) 5585 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 5586 X86::getShufflePSHUFLWImmediate(SVOp), 5587 DAG); 5588 5589 if (isSHUFPMask(M, VT)) { 5590 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5591 if (VT == MVT::v4f32 || VT == MVT::v4i32) 5592 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, 5593 TargetMask, DAG); 5594 if (VT == MVT::v2f64 || VT == MVT::v2i64) 5595 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, 5596 TargetMask, DAG); 5597 } 5598 5599 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 5600 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5601 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 5602 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 5603 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5604 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5605 5606 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 5607 if (VT == MVT::v8i16) { 5608 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 5609 if (NewOp.getNode()) 5610 return NewOp; 5611 } 5612 5613 if (VT == MVT::v16i8) { 5614 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 5615 if (NewOp.getNode()) 5616 return NewOp; 5617 } 5618 5619 // Handle all 4 wide cases with a number of shuffles. 5620 if (NumElems == 4) 5621 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 5622 5623 return SDValue(); 5624} 5625 5626SDValue 5627X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 5628 SelectionDAG &DAG) const { 5629 EVT VT = Op.getValueType(); 5630 DebugLoc dl = Op.getDebugLoc(); 5631 if (VT.getSizeInBits() == 8) { 5632 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 5633 Op.getOperand(0), Op.getOperand(1)); 5634 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5635 DAG.getValueType(VT)); 5636 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5637 } else if (VT.getSizeInBits() == 16) { 5638 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5639 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 
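// (Element 0 already sits in the low 16 bits of the register, so the
// bitcast to v4i32, the i32 extract of element 0 and the truncate below
// lower to a plain move.)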
5640 if (Idx == 0) 5641 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5642 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5643 DAG.getNode(ISD::BITCAST, dl, 5644 MVT::v4i32, 5645 Op.getOperand(0)), 5646 Op.getOperand(1))); 5647 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 5648 Op.getOperand(0), Op.getOperand(1)); 5649 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5650 DAG.getValueType(VT)); 5651 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5652 } else if (VT == MVT::f32) { 5653 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 5654 // the result back to FR32 register. It's only worth matching if the 5655 // result has a single use which is a store or a bitcast to i32. And in 5656 // the case of a store, it's not worth it if the index is a constant 0, 5657 // because a MOVSSmr can be used instead, which is smaller and faster. 5658 if (!Op.hasOneUse()) 5659 return SDValue(); 5660 SDNode *User = *Op.getNode()->use_begin(); 5661 if ((User->getOpcode() != ISD::STORE || 5662 (isa<ConstantSDNode>(Op.getOperand(1)) && 5663 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5664 (User->getOpcode() != ISD::BITCAST || 5665 User->getValueType(0) != MVT::i32)) 5666 return SDValue(); 5667 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5668 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 5669 Op.getOperand(0)), 5670 Op.getOperand(1)); 5671 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 5672 } else if (VT == MVT::i32) { 5673 // ExtractPS works with constant index. 5674 if (isa<ConstantSDNode>(Op.getOperand(1))) 5675 return Op; 5676 } 5677 return SDValue(); 5678} 5679 5680 5681SDValue 5682X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5683 SelectionDAG &DAG) const { 5684 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5685 return SDValue(); 5686 5687 if (Subtarget->hasSSE41()) { 5688 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 5689 if (Res.getNode()) 5690 return Res; 5691 } 5692 5693 EVT VT = Op.getValueType(); 5694 DebugLoc dl = Op.getDebugLoc(); 5695 // TODO: handle v16i8. 5696 if (VT.getSizeInBits() == 16) { 5697 SDValue Vec = Op.getOperand(0); 5698 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5699 if (Idx == 0) 5700 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5701 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5702 DAG.getNode(ISD::BITCAST, dl, 5703 MVT::v4i32, Vec), 5704 Op.getOperand(1))); 5705 // Transform it so it match pextrw which produces a 32-bit result. 5706 EVT EltVT = MVT::i32; 5707 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 5708 Op.getOperand(0), Op.getOperand(1)); 5709 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 5710 DAG.getValueType(VT)); 5711 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5712 } else if (VT.getSizeInBits() == 32) { 5713 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5714 if (Idx == 0) 5715 return Op; 5716 5717 // SHUFPS the element to the lowest double word, then movss. 
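// e.g. extracting element 2 becomes a <2, u, u, u> shuffle of the source
// with undef, followed by an extract of element 0 of that shuffle.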
5718 int Mask[4] = { Idx, -1, -1, -1 }; 5719 EVT VVT = Op.getOperand(0).getValueType(); 5720 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5721 DAG.getUNDEF(VVT), Mask); 5722 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5723 DAG.getIntPtrConstant(0)); 5724 } else if (VT.getSizeInBits() == 64) { 5725 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 5726 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 5727 // to match extract_elt for f64. 5728 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5729 if (Idx == 0) 5730 return Op; 5731 5732 // UNPCKHPD the element to the lowest double word, then movsd. 5733 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 5734 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 5735 int Mask[2] = { 1, -1 }; 5736 EVT VVT = Op.getOperand(0).getValueType(); 5737 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5738 DAG.getUNDEF(VVT), Mask); 5739 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5740 DAG.getIntPtrConstant(0)); 5741 } 5742 5743 return SDValue(); 5744} 5745 5746SDValue 5747X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 5748 SelectionDAG &DAG) const { 5749 EVT VT = Op.getValueType(); 5750 EVT EltVT = VT.getVectorElementType(); 5751 DebugLoc dl = Op.getDebugLoc(); 5752 5753 SDValue N0 = Op.getOperand(0); 5754 SDValue N1 = Op.getOperand(1); 5755 SDValue N2 = Op.getOperand(2); 5756 5757 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 5758 isa<ConstantSDNode>(N2)) { 5759 unsigned Opc; 5760 if (VT == MVT::v8i16) 5761 Opc = X86ISD::PINSRW; 5762 else if (VT == MVT::v16i8) 5763 Opc = X86ISD::PINSRB; 5764 else 5765 Opc = X86ISD::PINSRB; 5766 5767 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 5768 // argument. 5769 if (N1.getValueType() != MVT::i32) 5770 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5771 if (N2.getValueType() != MVT::i32) 5772 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5773 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 5774 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 5775 // Bits [7:6] of the constant are the source select. This will always be 5776 // zero here. The DAG Combiner may combine an extract_elt index into these 5777 // bits. For example (insert (extract, 3), 2) could be matched by putting 5778 // the '3' into bits [7:6] of X86ISD::INSERTPS. 5779 // Bits [5:4] of the constant are the destination select. This is the 5780 // value of the incoming immediate. 5781 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 5782 // combine either bitwise AND or insert of float 0.0 to set these bits. 5783 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 5784 // Create this as a scalar to vector.. 5785 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 5786 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 5787 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 5788 // PINSR* works with constant index. 
5789 return Op; 5790 } 5791 return SDValue(); 5792} 5793 5794SDValue 5795X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 5796 EVT VT = Op.getValueType(); 5797 EVT EltVT = VT.getVectorElementType(); 5798 5799 if (Subtarget->hasSSE41()) 5800 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 5801 5802 if (EltVT == MVT::i8) 5803 return SDValue(); 5804 5805 DebugLoc dl = Op.getDebugLoc(); 5806 SDValue N0 = Op.getOperand(0); 5807 SDValue N1 = Op.getOperand(1); 5808 SDValue N2 = Op.getOperand(2); 5809 5810 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5811 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 5812 // as its second argument. 5813 if (N1.getValueType() != MVT::i32) 5814 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5815 if (N2.getValueType() != MVT::i32) 5816 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5817 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 5818 } 5819 return SDValue(); 5820} 5821 5822SDValue 5823X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5824 DebugLoc dl = Op.getDebugLoc(); 5825 5826 if (Op.getValueType() == MVT::v1i64 && 5827 Op.getOperand(0).getValueType() == MVT::i64) 5828 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5829 5830 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5831 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && 5832 "Expected an SSE type!"); 5833 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), 5834 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 5835} 5836 5837// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5838// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 5839// one of the above mentioned nodes. It has to be wrapped because otherwise 5840// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 5841// be used to form addressing mode. These wrapped nodes will be selected 5842// into MOV32ri. 5843SDValue 5844X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5845 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5846 5847 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5848 // global base reg. 5849 unsigned char OpFlag = 0; 5850 unsigned WrapperKind = X86ISD::Wrapper; 5851 CodeModel::Model M = getTargetMachine().getCodeModel(); 5852 5853 if (Subtarget->isPICStyleRIPRel() && 5854 (M == CodeModel::Small || M == CodeModel::Kernel)) 5855 WrapperKind = X86ISD::WrapperRIP; 5856 else if (Subtarget->isPICStyleGOT()) 5857 OpFlag = X86II::MO_GOTOFF; 5858 else if (Subtarget->isPICStyleStubPIC()) 5859 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5860 5861 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5862 CP->getAlignment(), 5863 CP->getOffset(), OpFlag); 5864 DebugLoc DL = CP->getDebugLoc(); 5865 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5866 // With PIC, the address is actually $g + Offset. 5867 if (OpFlag) { 5868 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5869 DAG.getNode(X86ISD::GlobalBaseReg, 5870 DebugLoc(), getPointerTy()), 5871 Result); 5872 } 5873 5874 return Result; 5875} 5876 5877SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5878 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5879 5880 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5881 // global base reg. 
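// i.e. for 32-bit GOT-style PIC the jump table address is built as
//   (add GlobalBaseReg, (Wrapper TargetJumpTable@GOTOFF))
// while the RIP-relative small/kernel code models just wrap the target node
// in WrapperRIP and need no extra add.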
5882 unsigned char OpFlag = 0; 5883 unsigned WrapperKind = X86ISD::Wrapper; 5884 CodeModel::Model M = getTargetMachine().getCodeModel(); 5885 5886 if (Subtarget->isPICStyleRIPRel() && 5887 (M == CodeModel::Small || M == CodeModel::Kernel)) 5888 WrapperKind = X86ISD::WrapperRIP; 5889 else if (Subtarget->isPICStyleGOT()) 5890 OpFlag = X86II::MO_GOTOFF; 5891 else if (Subtarget->isPICStyleStubPIC()) 5892 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5893 5894 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5895 OpFlag); 5896 DebugLoc DL = JT->getDebugLoc(); 5897 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5898 5899 // With PIC, the address is actually $g + Offset. 5900 if (OpFlag) 5901 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5902 DAG.getNode(X86ISD::GlobalBaseReg, 5903 DebugLoc(), getPointerTy()), 5904 Result); 5905 5906 return Result; 5907} 5908 5909SDValue 5910X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5911 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5912 5913 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5914 // global base reg. 5915 unsigned char OpFlag = 0; 5916 unsigned WrapperKind = X86ISD::Wrapper; 5917 CodeModel::Model M = getTargetMachine().getCodeModel(); 5918 5919 if (Subtarget->isPICStyleRIPRel() && 5920 (M == CodeModel::Small || M == CodeModel::Kernel)) 5921 WrapperKind = X86ISD::WrapperRIP; 5922 else if (Subtarget->isPICStyleGOT()) 5923 OpFlag = X86II::MO_GOTOFF; 5924 else if (Subtarget->isPICStyleStubPIC()) 5925 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5926 5927 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5928 5929 DebugLoc DL = Op.getDebugLoc(); 5930 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5931 5932 5933 // With PIC, the address is actually $g + Offset. 5934 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5935 !Subtarget->is64Bit()) { 5936 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5937 DAG.getNode(X86ISD::GlobalBaseReg, 5938 DebugLoc(), getPointerTy()), 5939 Result); 5940 } 5941 5942 return Result; 5943} 5944 5945SDValue 5946X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5947 // Create the TargetBlockAddressAddress node. 5948 unsigned char OpFlags = 5949 Subtarget->ClassifyBlockAddressReference(); 5950 CodeModel::Model M = getTargetMachine().getCodeModel(); 5951 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5952 DebugLoc dl = Op.getDebugLoc(); 5953 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5954 /*isTarget=*/true, OpFlags); 5955 5956 if (Subtarget->isPICStyleRIPRel() && 5957 (M == CodeModel::Small || M == CodeModel::Kernel)) 5958 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5959 else 5960 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5961 5962 // With PIC, the address is actually $g + Offset. 5963 if (isGlobalRelativeToPICBase(OpFlags)) { 5964 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5965 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5966 Result); 5967 } 5968 5969 return Result; 5970} 5971 5972SDValue 5973X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5974 int64_t Offset, 5975 SelectionDAG &DAG) const { 5976 // Create the TargetGlobalAddress node, folding in the constant 5977 // offset if it is legal. 
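// The offset is folded into the TargetGlobalAddress only when the reference
// needs no special flag (a direct static reference) and the displacement
// still fits the code model; otherwise it is kept aside and materialized
// with an explicit ADD at the end.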
5978 unsigned char OpFlags = 5979 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5980 CodeModel::Model M = getTargetMachine().getCodeModel(); 5981 SDValue Result; 5982 if (OpFlags == X86II::MO_NO_FLAG && 5983 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5984 // A direct static reference to a global. 5985 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 5986 Offset = 0; 5987 } else { 5988 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 5989 } 5990 5991 if (Subtarget->isPICStyleRIPRel() && 5992 (M == CodeModel::Small || M == CodeModel::Kernel)) 5993 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5994 else 5995 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5996 5997 // With PIC, the address is actually $g + Offset. 5998 if (isGlobalRelativeToPICBase(OpFlags)) { 5999 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6000 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6001 Result); 6002 } 6003 6004 // For globals that require a load from a stub to get the address, emit the 6005 // load. 6006 if (isGlobalStubReference(OpFlags)) 6007 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 6008 MachinePointerInfo::getGOT(), false, false, 0); 6009 6010 // If there was a non-zero offset that we didn't fold, create an explicit 6011 // addition for it. 6012 if (Offset != 0) 6013 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 6014 DAG.getConstant(Offset, getPointerTy())); 6015 6016 return Result; 6017} 6018 6019SDValue 6020X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 6021 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 6022 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 6023 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 6024} 6025 6026static SDValue 6027GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 6028 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 6029 unsigned char OperandFlags) { 6030 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6031 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 6032 DebugLoc dl = GA->getDebugLoc(); 6033 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6034 GA->getValueType(0), 6035 GA->getOffset(), 6036 OperandFlags); 6037 if (InFlag) { 6038 SDValue Ops[] = { Chain, TGA, *InFlag }; 6039 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 6040 } else { 6041 SDValue Ops[] = { Chain, TGA }; 6042 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 6043 } 6044 6045 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 6046 MFI->setAdjustsStack(true); 6047 6048 SDValue Flag = Chain.getValue(1); 6049 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 6050} 6051 6052// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 6053static SDValue 6054LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6055 const EVT PtrVT) { 6056 SDValue InFlag; 6057 DebugLoc dl = GA->getDebugLoc(); // ? 
function entry point might be better 6058 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 6059 DAG.getNode(X86ISD::GlobalBaseReg, 6060 DebugLoc(), PtrVT), InFlag); 6061 InFlag = Chain.getValue(1); 6062 6063 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 6064} 6065 6066// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 6067static SDValue 6068LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6069 const EVT PtrVT) { 6070 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 6071 X86::RAX, X86II::MO_TLSGD); 6072} 6073 6074// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 6075// "local exec" model. 6076static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6077 const EVT PtrVT, TLSModel::Model model, 6078 bool is64Bit) { 6079 DebugLoc dl = GA->getDebugLoc(); 6080 6081 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 6082 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 6083 is64Bit ? 257 : 256)); 6084 6085 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 6086 DAG.getIntPtrConstant(0), 6087 MachinePointerInfo(Ptr), false, false, 0); 6088 6089 unsigned char OperandFlags = 0; 6090 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 6091 // initialexec. 6092 unsigned WrapperKind = X86ISD::Wrapper; 6093 if (model == TLSModel::LocalExec) { 6094 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 6095 } else if (is64Bit) { 6096 assert(model == TLSModel::InitialExec); 6097 OperandFlags = X86II::MO_GOTTPOFF; 6098 WrapperKind = X86ISD::WrapperRIP; 6099 } else { 6100 assert(model == TLSModel::InitialExec); 6101 OperandFlags = X86II::MO_INDNTPOFF; 6102 } 6103 6104 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 6105 // exec) 6106 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6107 GA->getValueType(0), 6108 GA->getOffset(), OperandFlags); 6109 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 6110 6111 if (model == TLSModel::InitialExec) 6112 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 6113 MachinePointerInfo::getGOT(), false, false, 0); 6114 6115 // The address of the thread local variable is the add of the thread 6116 // pointer with the offset of the variable. 6117 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 6118} 6119 6120SDValue 6121X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 6122 6123 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 6124 const GlobalValue *GV = GA->getGlobal(); 6125 6126 if (Subtarget->isTargetELF()) { 6127 // TODO: implement the "local dynamic" model 6128 // TODO: implement the "initial exec"model for pic executables 6129 6130 // If GV is an alias then use the aliasee for determining 6131 // thread-localness. 
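    // Very roughly, the models chosen below correspond to these sequences:
    //   general/local dynamic: lea x@tlsgd(...),%reg; call ___tls_get_addr
    //   initial exec: thread pointer (%gs:0 or %fs:0) plus an @indntpoff /
    //                 @gottpoff offset loaded from the GOT
    //   local exec:   thread pointer plus an @ntpoff/@tpoff immediate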
6132 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 6133 GV = GA->resolveAliasedGlobal(false); 6134 6135 TLSModel::Model model 6136 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 6137 6138 switch (model) { 6139 case TLSModel::GeneralDynamic: 6140 case TLSModel::LocalDynamic: // not implemented 6141 if (Subtarget->is64Bit()) 6142 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 6143 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 6144 6145 case TLSModel::InitialExec: 6146 case TLSModel::LocalExec: 6147 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 6148 Subtarget->is64Bit()); 6149 } 6150 } else if (Subtarget->isTargetDarwin()) { 6151 // Darwin only has one model of TLS. Lower to that. 6152 unsigned char OpFlag = 0; 6153 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 6154 X86ISD::WrapperRIP : X86ISD::Wrapper; 6155 6156 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6157 // global base reg. 6158 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 6159 !Subtarget->is64Bit(); 6160 if (PIC32) 6161 OpFlag = X86II::MO_TLVP_PIC_BASE; 6162 else 6163 OpFlag = X86II::MO_TLVP; 6164 DebugLoc DL = Op.getDebugLoc(); 6165 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 6166 GA->getValueType(0), 6167 GA->getOffset(), OpFlag); 6168 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6169 6170 // With PIC32, the address is actually $g + Offset. 6171 if (PIC32) 6172 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6173 DAG.getNode(X86ISD::GlobalBaseReg, 6174 DebugLoc(), getPointerTy()), 6175 Offset); 6176 6177 // Lowering the machine isd will make sure everything is in the right 6178 // location. 6179 SDValue Chain = DAG.getEntryNode(); 6180 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 6181 SDValue Args[] = { Chain, Offset }; 6182 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 6183 6184 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 6185 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6186 MFI->setAdjustsStack(true); 6187 6188 // And our return value (tls address) is in the standard call return value 6189 // location. 6190 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 6191 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 6192 } 6193 6194 assert(false && 6195 "TLS not implemented for this target."); 6196 6197 llvm_unreachable("Unreachable"); 6198 return SDValue(); 6199} 6200 6201 6202/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 6203/// take a 2 x i32 value to shift plus a shift amount. 6204SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 6205 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6206 EVT VT = Op.getValueType(); 6207 unsigned VTBits = VT.getSizeInBits(); 6208 DebugLoc dl = Op.getDebugLoc(); 6209 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 6210 SDValue ShOpLo = Op.getOperand(0); 6211 SDValue ShOpHi = Op.getOperand(1); 6212 SDValue ShAmt = Op.getOperand(2); 6213 SDValue Tmp1 = isSRA ? 
DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 6214 DAG.getConstant(VTBits - 1, MVT::i8)) 6215 : DAG.getConstant(0, VT); 6216 6217 SDValue Tmp2, Tmp3; 6218 if (Op.getOpcode() == ISD::SHL_PARTS) { 6219 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 6220 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6221 } else { 6222 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 6223 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 6224 } 6225 6226 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 6227 DAG.getConstant(VTBits, MVT::i8)); 6228 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6229 AndNode, DAG.getConstant(0, MVT::i8)); 6230 6231 SDValue Hi, Lo; 6232 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6233 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 6234 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 6235 6236 if (Op.getOpcode() == ISD::SHL_PARTS) { 6237 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6238 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6239 } else { 6240 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6241 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6242 } 6243 6244 SDValue Ops[2] = { Lo, Hi }; 6245 return DAG.getMergeValues(Ops, 2, dl); 6246} 6247 6248SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 6249 SelectionDAG &DAG) const { 6250 EVT SrcVT = Op.getOperand(0).getValueType(); 6251 6252 if (SrcVT.isVector()) 6253 return SDValue(); 6254 6255 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 6256 "Unknown SINT_TO_FP to lower!"); 6257 6258 // These are really Legal; return the operand so the caller accepts it as 6259 // Legal. 6260 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 6261 return Op; 6262 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 6263 Subtarget->is64Bit()) { 6264 return Op; 6265 } 6266 6267 DebugLoc dl = Op.getDebugLoc(); 6268 unsigned Size = SrcVT.getSizeInBits()/8; 6269 MachineFunction &MF = DAG.getMachineFunction(); 6270 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 6271 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6272 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6273 StackSlot, 6274 MachinePointerInfo::getFixedStack(SSFI), 6275 false, false, 0); 6276 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 6277} 6278 6279SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 6280 SDValue StackSlot, 6281 SelectionDAG &DAG) const { 6282 // Build the FILD 6283 DebugLoc DL = Op.getDebugLoc(); 6284 SDVTList Tys; 6285 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 6286 if (useSSE) 6287 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 6288 else 6289 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 6290 6291 unsigned ByteSize = SrcVT.getSizeInBits()/8; 6292 6293 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6294 MachineMemOperand *MMO = 6295 DAG.getMachineFunction() 6296 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6297 MachineMemOperand::MOLoad, ByteSize, ByteSize); 6298 6299 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 6300 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 6301 X86ISD::FILD, DL, 6302 Tys, Ops, array_lengthof(Ops), 6303 SrcVT, MMO); 6304 6305 if (useSSE) { 6306 Chain = Result.getValue(1); 6307 SDValue InFlag = Result.getValue(2); 6308 6309 // FIXME: Currently the FST is flagged to the FILD_FLAG. 
This 6310 // shouldn't be necessary except that RFP cannot be live across 6311 // multiple blocks. When stackifier is fixed, they can be uncoupled. 6312 MachineFunction &MF = DAG.getMachineFunction(); 6313 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 6314 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 6315 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6316 Tys = DAG.getVTList(MVT::Other); 6317 SDValue Ops[] = { 6318 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 6319 }; 6320 MachineMemOperand *MMO = 6321 DAG.getMachineFunction() 6322 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6323 MachineMemOperand::MOStore, SSFISize, SSFISize); 6324 6325 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 6326 Ops, array_lengthof(Ops), 6327 Op.getValueType(), MMO); 6328 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 6329 MachinePointerInfo::getFixedStack(SSFI), 6330 false, false, 0); 6331 } 6332 6333 return Result; 6334} 6335 6336// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 6337SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 6338 SelectionDAG &DAG) const { 6339 // This algorithm is not obvious. Here it is in C code, more or less: 6340 /* 6341 double uint64_to_double( uint32_t hi, uint32_t lo ) { 6342 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 6343 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 6344 6345 // Copy ints to xmm registers. 6346 __m128i xh = _mm_cvtsi32_si128( hi ); 6347 __m128i xl = _mm_cvtsi32_si128( lo ); 6348 6349 // Combine into low half of a single xmm register. 6350 __m128i x = _mm_unpacklo_epi32( xh, xl ); 6351 __m128d d; 6352 double sd; 6353 6354 // Merge in appropriate exponents to give the integer bits the right 6355 // magnitude. 6356 x = _mm_unpacklo_epi32( x, exp ); 6357 6358 // Subtract away the biases to deal with the IEEE-754 double precision 6359 // implicit 1. 6360 d = _mm_sub_pd( (__m128d) x, bias ); 6361 6362 // All conversions up to here are exact. The correctly rounded result is 6363 // calculated using the current rounding mode using the following 6364 // horizontal add. 6365 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 6366 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 6367 // store doesn't really need to be here (except 6368 // maybe to zero the other double) 6369 return sd; 6370 } 6371 */ 6372 6373 DebugLoc dl = Op.getDebugLoc(); 6374 LLVMContext *Context = DAG.getContext(); 6375 6376 // Build some magic constants. 
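  // For reference, the "magic" values below are the bit patterns of the two
  // biases from the C sketch above: 0x4330000000000000 is 2^52 (0x1.0p52) and
  // 0x4530000000000000 is 2^84 (0x1.0p84); the i32 values 0x43300000 and
  // 0x45300000 are their high words.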
6377 std::vector<Constant*> CV0; 6378 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 6379 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 6380 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6381 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6382 Constant *C0 = ConstantVector::get(CV0); 6383 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 6384 6385 std::vector<Constant*> CV1; 6386 CV1.push_back( 6387 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 6388 CV1.push_back( 6389 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 6390 Constant *C1 = ConstantVector::get(CV1); 6391 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 6392 6393 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6394 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6395 Op.getOperand(0), 6396 DAG.getIntPtrConstant(1))); 6397 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6398 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6399 Op.getOperand(0), 6400 DAG.getIntPtrConstant(0))); 6401 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 6402 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 6403 MachinePointerInfo::getConstantPool(), 6404 false, false, 16); 6405 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 6406 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); 6407 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 6408 MachinePointerInfo::getConstantPool(), 6409 false, false, 16); 6410 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 6411 6412 // Add the halves; easiest way is to swap them into another reg first. 6413 int ShufMask[2] = { 1, -1 }; 6414 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 6415 DAG.getUNDEF(MVT::v2f64), ShufMask); 6416 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 6417 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 6418 DAG.getIntPtrConstant(0)); 6419} 6420 6421// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 6422SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 6423 SelectionDAG &DAG) const { 6424 DebugLoc dl = Op.getDebugLoc(); 6425 // FP constant to bias correct the final result. 6426 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 6427 MVT::f64); 6428 6429 // Load the 32-bit value into an XMM register. 6430 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6431 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6432 Op.getOperand(0), 6433 DAG.getIntPtrConstant(0))); 6434 6435 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6436 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 6437 DAG.getIntPtrConstant(0)); 6438 6439 // Or the load with the bias. 6440 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 6441 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 6442 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6443 MVT::v2f64, Load)), 6444 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 6445 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6446 MVT::v2f64, Bias))); 6447 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6448 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 6449 DAG.getIntPtrConstant(0)); 6450 6451 // Subtract the bias. 6452 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 6453 6454 // Handle final rounding. 
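  // The OR above splices the 32-bit value into the mantissa of 2^52
  // (0x4330000000000000), so the resulting double is exactly 2^52 + x;
  // subtracting the bias then leaves x as a double.  The code below only has
  // to round or extend that f64 result to the destination type.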
6455 EVT DestVT = Op.getValueType(); 6456 6457 if (DestVT.bitsLT(MVT::f64)) { 6458 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 6459 DAG.getIntPtrConstant(0)); 6460 } else if (DestVT.bitsGT(MVT::f64)) { 6461 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 6462 } 6463 6464 // Handle final rounding. 6465 return Sub; 6466} 6467 6468SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 6469 SelectionDAG &DAG) const { 6470 SDValue N0 = Op.getOperand(0); 6471 DebugLoc dl = Op.getDebugLoc(); 6472 6473 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 6474 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 6475 // the optimization here. 6476 if (DAG.SignBitIsZero(N0)) 6477 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 6478 6479 EVT SrcVT = N0.getValueType(); 6480 EVT DstVT = Op.getValueType(); 6481 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 6482 return LowerUINT_TO_FP_i64(Op, DAG); 6483 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 6484 return LowerUINT_TO_FP_i32(Op, DAG); 6485 6486 // Make a 64-bit buffer, and use it to build an FILD. 6487 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 6488 if (SrcVT == MVT::i32) { 6489 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 6490 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 6491 getPointerTy(), StackSlot, WordOff); 6492 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6493 StackSlot, MachinePointerInfo(), 6494 false, false, 0); 6495 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 6496 OffsetSlot, MachinePointerInfo(), 6497 false, false, 0); 6498 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 6499 return Fild; 6500 } 6501 6502 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 6503 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6504 StackSlot, MachinePointerInfo(), 6505 false, false, 0); 6506 // For i64 source, we need to add the appropriate power of 2 if the input 6507 // was negative. This is the same as the optimization in 6508 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 6509 // we must be careful to do the computation in x87 extended precision, not 6510 // in SSE. (The generic code can't know it's OK to do this, or how to.) 6511 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6512 MachineMemOperand *MMO = 6513 DAG.getMachineFunction() 6514 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6515 MachineMemOperand::MOLoad, 8, 8); 6516 6517 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 6518 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 6519 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 6520 MVT::i64, MMO); 6521 6522 APInt FF(32, 0x5F800000ULL); 6523 6524 // Check whether the sign bit is set. 6525 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 6526 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 6527 ISD::SETLT); 6528 6529 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 6530 SDValue FudgePtr = DAG.getConstantPool( 6531 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 6532 getPointerTy()); 6533 6534 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 
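  // (0x5F800000 is 2^64 encoded as an IEEE-754 float.  If the i64 input had
  // its sign bit set, the FILD above interpreted it as value - 2^64, so
  // adding 2^64 back produces the correct unsigned value.)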
6535 SDValue Zero = DAG.getIntPtrConstant(0); 6536 SDValue Four = DAG.getIntPtrConstant(4); 6537 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 6538 Zero, Four); 6539 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 6540 6541 // Load the value out, extending it from f32 to f80. 6542 // FIXME: Avoid the extend by constructing the right constant pool? 6543 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 6544 FudgePtr, MachinePointerInfo::getConstantPool(), 6545 MVT::f32, false, false, 4); 6546 // Extend everything to 80 bits to force it to be done on x87. 6547 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 6548 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 6549} 6550 6551std::pair<SDValue,SDValue> X86TargetLowering:: 6552FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 6553 DebugLoc DL = Op.getDebugLoc(); 6554 6555 EVT DstTy = Op.getValueType(); 6556 6557 if (!IsSigned) { 6558 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 6559 DstTy = MVT::i64; 6560 } 6561 6562 assert(DstTy.getSimpleVT() <= MVT::i64 && 6563 DstTy.getSimpleVT() >= MVT::i16 && 6564 "Unknown FP_TO_SINT to lower!"); 6565 6566 // These are really Legal. 6567 if (DstTy == MVT::i32 && 6568 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6569 return std::make_pair(SDValue(), SDValue()); 6570 if (Subtarget->is64Bit() && 6571 DstTy == MVT::i64 && 6572 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6573 return std::make_pair(SDValue(), SDValue()); 6574 6575 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 6576 // stack slot. 6577 MachineFunction &MF = DAG.getMachineFunction(); 6578 unsigned MemSize = DstTy.getSizeInBits()/8; 6579 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6580 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6581 6582 6583 6584 unsigned Opc; 6585 switch (DstTy.getSimpleVT().SimpleTy) { 6586 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 6587 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 6588 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 6589 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 6590 } 6591 6592 SDValue Chain = DAG.getEntryNode(); 6593 SDValue Value = Op.getOperand(0); 6594 EVT TheVT = Op.getOperand(0).getValueType(); 6595 if (isScalarFPTypeInSSEReg(TheVT)) { 6596 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 6597 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 6598 MachinePointerInfo::getFixedStack(SSFI), 6599 false, false, 0); 6600 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 6601 SDValue Ops[] = { 6602 Chain, StackSlot, DAG.getValueType(TheVT) 6603 }; 6604 6605 MachineMemOperand *MMO = 6606 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6607 MachineMemOperand::MOLoad, MemSize, MemSize); 6608 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 6609 DstTy, MMO); 6610 Chain = Value.getValue(1); 6611 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6612 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6613 } 6614 6615 MachineMemOperand *MMO = 6616 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6617 MachineMemOperand::MOStore, MemSize, MemSize); 6618 6619 // Build the FP_TO_INT*_IN_MEM 6620 SDValue Ops[] = { Chain, Value, StackSlot }; 6621 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, 
DAG.getVTList(MVT::Other), 6622 Ops, 3, DstTy, MMO); 6623 6624 return std::make_pair(FIST, StackSlot); 6625} 6626 6627SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 6628 SelectionDAG &DAG) const { 6629 if (Op.getValueType().isVector()) 6630 return SDValue(); 6631 6632 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 6633 SDValue FIST = Vals.first, StackSlot = Vals.second; 6634 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 6635 if (FIST.getNode() == 0) return Op; 6636 6637 // Load the result. 6638 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6639 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 6640} 6641 6642SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 6643 SelectionDAG &DAG) const { 6644 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 6645 SDValue FIST = Vals.first, StackSlot = Vals.second; 6646 assert(FIST.getNode() && "Unexpected failure"); 6647 6648 // Load the result. 6649 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6650 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 6651} 6652 6653SDValue X86TargetLowering::LowerFABS(SDValue Op, 6654 SelectionDAG &DAG) const { 6655 LLVMContext *Context = DAG.getContext(); 6656 DebugLoc dl = Op.getDebugLoc(); 6657 EVT VT = Op.getValueType(); 6658 EVT EltVT = VT; 6659 if (VT.isVector()) 6660 EltVT = VT.getVectorElementType(); 6661 std::vector<Constant*> CV; 6662 if (EltVT == MVT::f64) { 6663 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 6664 CV.push_back(C); 6665 CV.push_back(C); 6666 } else { 6667 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 6668 CV.push_back(C); 6669 CV.push_back(C); 6670 CV.push_back(C); 6671 CV.push_back(C); 6672 } 6673 Constant *C = ConstantVector::get(CV); 6674 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6675 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6676 MachinePointerInfo::getConstantPool(), 6677 false, false, 16); 6678 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 6679} 6680 6681SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 6682 LLVMContext *Context = DAG.getContext(); 6683 DebugLoc dl = Op.getDebugLoc(); 6684 EVT VT = Op.getValueType(); 6685 EVT EltVT = VT; 6686 if (VT.isVector()) 6687 EltVT = VT.getVectorElementType(); 6688 std::vector<Constant*> CV; 6689 if (EltVT == MVT::f64) { 6690 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 6691 CV.push_back(C); 6692 CV.push_back(C); 6693 } else { 6694 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 6695 CV.push_back(C); 6696 CV.push_back(C); 6697 CV.push_back(C); 6698 CV.push_back(C); 6699 } 6700 Constant *C = ConstantVector::get(CV); 6701 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6702 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6703 MachinePointerInfo::getConstantPool(), 6704 false, false, 16); 6705 if (VT.isVector()) { 6706 return DAG.getNode(ISD::BITCAST, dl, VT, 6707 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6708 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 6709 Op.getOperand(0)), 6710 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); 6711 } else { 6712 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6713 } 6714} 6715 6716SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6717 LLVMContext *Context = DAG.getContext(); 6718 SDValue Op0 = Op.getOperand(0); 6719 SDValue Op1 = Op.getOperand(1); 
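  // As with FABS and FNEG above, copysign is implemented with constant-pool
  // bit masks: roughly Result = (Op0 & ~SignMask) | (SignBit of Op1), using
  // FAND/FOR on 16-byte constants so the same path handles f32 and f64.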
6720 DebugLoc dl = Op.getDebugLoc(); 6721 EVT VT = Op.getValueType(); 6722 EVT SrcVT = Op1.getValueType(); 6723 6724 // If second operand is smaller, extend it first. 6725 if (SrcVT.bitsLT(VT)) { 6726 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6727 SrcVT = VT; 6728 } 6729 // And if it is bigger, shrink it first. 6730 if (SrcVT.bitsGT(VT)) { 6731 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6732 SrcVT = VT; 6733 } 6734 6735 // At this point the operands and the result should have the same 6736 // type, and that won't be f80 since that is not custom lowered. 6737 6738 // First get the sign bit of second operand. 6739 std::vector<Constant*> CV; 6740 if (SrcVT == MVT::f64) { 6741 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6742 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6743 } else { 6744 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6745 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6746 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6747 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6748 } 6749 Constant *C = ConstantVector::get(CV); 6750 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6751 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6752 MachinePointerInfo::getConstantPool(), 6753 false, false, 16); 6754 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6755 6756 // Shift sign bit right or left if the two operands have different types. 6757 if (SrcVT.bitsGT(VT)) { 6758 // Op0 is MVT::f32, Op1 is MVT::f64. 6759 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6760 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6761 DAG.getConstant(32, MVT::i32)); 6762 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 6763 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6764 DAG.getIntPtrConstant(0)); 6765 } 6766 6767 // Clear first operand sign bit. 6768 CV.clear(); 6769 if (VT == MVT::f64) { 6770 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6771 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6772 } else { 6773 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6774 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6775 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6776 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6777 } 6778 C = ConstantVector::get(CV); 6779 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6780 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6781 MachinePointerInfo::getConstantPool(), 6782 false, false, 16); 6783 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6784 6785 // Or the value with the sign bit. 6786 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6787} 6788 6789/// Emit nodes that will be selected as "test Op0,Op0", or something 6790/// equivalent. 6791SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6792 SelectionDAG &DAG) const { 6793 DebugLoc dl = Op.getDebugLoc(); 6794 6795 // CF and OF aren't always set the way we want. Determine which 6796 // of these we need. 
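  // For example, in (a + b) != 0 the ADD already produces the ZF we need, so
  // the separate test can be dropped by rewriting the add into the
  // EFLAGS-producing X86ISD::ADD below.  Conditions that read CF or OF cannot
  // reuse arithmetic flags this way, since the TEST being replaced would have
  // left those flags at zero; hence the check that follows.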
6797 bool NeedCF = false; 6798 bool NeedOF = false; 6799 switch (X86CC) { 6800 default: break; 6801 case X86::COND_A: case X86::COND_AE: 6802 case X86::COND_B: case X86::COND_BE: 6803 NeedCF = true; 6804 break; 6805 case X86::COND_G: case X86::COND_GE: 6806 case X86::COND_L: case X86::COND_LE: 6807 case X86::COND_O: case X86::COND_NO: 6808 NeedOF = true; 6809 break; 6810 } 6811 6812 // See if we can use the EFLAGS value from the operand instead of 6813 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 6814 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6815 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6816 // Emit a CMP with 0, which is the TEST pattern. 6817 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6818 DAG.getConstant(0, Op.getValueType())); 6819 6820 unsigned Opcode = 0; 6821 unsigned NumOperands = 0; 6822 switch (Op.getNode()->getOpcode()) { 6823 case ISD::ADD: 6824 // Due to an isel shortcoming, be conservative if this add is likely to be 6825 // selected as part of a load-modify-store instruction. When the root node 6826 // in a match is a store, isel doesn't know how to remap non-chain non-flag 6827 // uses of other nodes in the match, such as the ADD in this case. This 6828 // leads to the ADD being left around and reselected, with the result being 6829 // two adds in the output. Alas, even if none our users are stores, that 6830 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 6831 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 6832 // climbing the DAG back to the root, and it doesn't seem to be worth the 6833 // effort. 6834 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6835 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6836 if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) 6837 goto default_case; 6838 6839 if (ConstantSDNode *C = 6840 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 6841 // An add of one will be selected as an INC. 6842 if (C->getAPIntValue() == 1) { 6843 Opcode = X86ISD::INC; 6844 NumOperands = 1; 6845 break; 6846 } 6847 6848 // An add of negative one (subtract of one) will be selected as a DEC. 6849 if (C->getAPIntValue().isAllOnesValue()) { 6850 Opcode = X86ISD::DEC; 6851 NumOperands = 1; 6852 break; 6853 } 6854 } 6855 6856 // Otherwise use a regular EFLAGS-setting add. 6857 Opcode = X86ISD::ADD; 6858 NumOperands = 2; 6859 break; 6860 case ISD::AND: { 6861 // If the primary and result isn't used, don't bother using X86ISD::AND, 6862 // because a TEST instruction will be better. 6863 bool NonFlagUse = false; 6864 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6865 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 6866 SDNode *User = *UI; 6867 unsigned UOpNo = UI.getOperandNo(); 6868 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 6869 // Look pass truncate. 6870 UOpNo = User->use_begin().getOperandNo(); 6871 User = *User->use_begin(); 6872 } 6873 6874 if (User->getOpcode() != ISD::BRCOND && 6875 User->getOpcode() != ISD::SETCC && 6876 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 6877 NonFlagUse = true; 6878 break; 6879 } 6880 } 6881 6882 if (!NonFlagUse) 6883 break; 6884 } 6885 // FALL THROUGH 6886 case ISD::SUB: 6887 case ISD::OR: 6888 case ISD::XOR: 6889 // Due to the ISEL shortcoming noted above, be conservative if this op is 6890 // likely to be selected as part of a load-modify-store instruction. 
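      // (A typical load-modify-store case is "*p |= x", which isel wants to
      // match as a single "or [mem], reg" rooted at the store; as described
      // above for ADD, rewriting the node here could leave it to be selected
      // twice.)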
6891 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6892 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6893 if (UI->getOpcode() == ISD::STORE) 6894 goto default_case; 6895 6896 // Otherwise use a regular EFLAGS-setting instruction. 6897 switch (Op.getNode()->getOpcode()) { 6898 default: llvm_unreachable("unexpected operator!"); 6899 case ISD::SUB: Opcode = X86ISD::SUB; break; 6900 case ISD::OR: Opcode = X86ISD::OR; break; 6901 case ISD::XOR: Opcode = X86ISD::XOR; break; 6902 case ISD::AND: Opcode = X86ISD::AND; break; 6903 } 6904 6905 NumOperands = 2; 6906 break; 6907 case X86ISD::ADD: 6908 case X86ISD::SUB: 6909 case X86ISD::INC: 6910 case X86ISD::DEC: 6911 case X86ISD::OR: 6912 case X86ISD::XOR: 6913 case X86ISD::AND: 6914 return SDValue(Op.getNode(), 1); 6915 default: 6916 default_case: 6917 break; 6918 } 6919 6920 if (Opcode == 0) 6921 // Emit a CMP with 0, which is the TEST pattern. 6922 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6923 DAG.getConstant(0, Op.getValueType())); 6924 6925 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6926 SmallVector<SDValue, 4> Ops; 6927 for (unsigned i = 0; i != NumOperands; ++i) 6928 Ops.push_back(Op.getOperand(i)); 6929 6930 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6931 DAG.ReplaceAllUsesWith(Op, New); 6932 return SDValue(New.getNode(), 1); 6933} 6934 6935/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6936/// equivalent. 6937SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6938 SelectionDAG &DAG) const { 6939 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6940 if (C->getAPIntValue() == 0) 6941 return EmitTest(Op0, X86CC, DAG); 6942 6943 DebugLoc dl = Op0.getDebugLoc(); 6944 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6945} 6946 6947/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6948/// if it's possible. 6949SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6950 DebugLoc dl, SelectionDAG &DAG) const { 6951 SDValue Op0 = And.getOperand(0); 6952 SDValue Op1 = And.getOperand(1); 6953 if (Op0.getOpcode() == ISD::TRUNCATE) 6954 Op0 = Op0.getOperand(0); 6955 if (Op1.getOpcode() == ISD::TRUNCATE) 6956 Op1 = Op1.getOperand(0); 6957 6958 SDValue LHS, RHS; 6959 if (Op1.getOpcode() == ISD::SHL) 6960 std::swap(Op0, Op1); 6961 if (Op0.getOpcode() == ISD::SHL) { 6962 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6963 if (And00C->getZExtValue() == 1) { 6964 // If we looked past a truncate, check that it's only truncating away 6965 // known zeros. 6966 unsigned BitWidth = Op0.getValueSizeInBits(); 6967 unsigned AndBitWidth = And.getValueSizeInBits(); 6968 if (BitWidth > AndBitWidth) { 6969 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 6970 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 6971 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 6972 return SDValue(); 6973 } 6974 LHS = Op1; 6975 RHS = Op0.getOperand(1); 6976 } 6977 } else if (Op1.getOpcode() == ISD::Constant) { 6978 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6979 SDValue AndLHS = Op0; 6980 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6981 LHS = AndLHS.getOperand(0); 6982 RHS = AndLHS.getOperand(1); 6983 } 6984 } 6985 6986 if (LHS.getNode()) { 6987 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 6988 // instruction. Since the shift amount is in-range-or-undefined, we know 6989 // that doing a bittest on the i32 value is ok. 
We extend to i32 because 6990 // the encoding for the i16 version is larger than the i32 version. 6991 // Also promote i16 to i32 for performance / code size reason. 6992 if (LHS.getValueType() == MVT::i8 || 6993 LHS.getValueType() == MVT::i16) 6994 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 6995 6996 // If the operand types disagree, extend the shift amount to match. Since 6997 // BT ignores high bits (like shifts) we can use anyextend. 6998 if (LHS.getValueType() != RHS.getValueType()) 6999 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 7000 7001 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 7002 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 7003 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7004 DAG.getConstant(Cond, MVT::i8), BT); 7005 } 7006 7007 return SDValue(); 7008} 7009 7010SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 7011 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 7012 SDValue Op0 = Op.getOperand(0); 7013 SDValue Op1 = Op.getOperand(1); 7014 DebugLoc dl = Op.getDebugLoc(); 7015 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7016 7017 // Optimize to BT if possible. 7018 // Lower (X & (1 << N)) == 0 to BT(X, N). 7019 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 7020 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 7021 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 7022 Op1.getOpcode() == ISD::Constant && 7023 cast<ConstantSDNode>(Op1)->isNullValue() && 7024 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7025 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 7026 if (NewSetCC.getNode()) 7027 return NewSetCC; 7028 } 7029 7030 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 7031 // these. 7032 if (Op1.getOpcode() == ISD::Constant && 7033 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 7034 cast<ConstantSDNode>(Op1)->isNullValue()) && 7035 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7036 7037 // If the input is a setcc, then reuse the input setcc or use a new one with 7038 // the inverted condition. 7039 if (Op0.getOpcode() == X86ISD::SETCC) { 7040 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 7041 bool Invert = (CC == ISD::SETNE) ^ 7042 cast<ConstantSDNode>(Op1)->isNullValue(); 7043 if (!Invert) return Op0; 7044 7045 CCode = X86::GetOppositeBranchCondition(CCode); 7046 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7047 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 7048 } 7049 } 7050 7051 bool isFP = Op1.getValueType().isFloatingPoint(); 7052 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 7053 if (X86CC == X86::COND_INVALID) 7054 return SDValue(); 7055 7056 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 7057 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7058 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 7059} 7060 7061SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 7062 SDValue Cond; 7063 SDValue Op0 = Op.getOperand(0); 7064 SDValue Op1 = Op.getOperand(1); 7065 SDValue CC = Op.getOperand(2); 7066 EVT VT = Op.getValueType(); 7067 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 7068 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 7069 DebugLoc dl = Op.getDebugLoc(); 7070 7071 if (isFP) { 7072 unsigned SSECC = 8; 7073 EVT VT0 = Op0.getValueType(); 7074 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 7075 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 7076 bool Swap = false; 7077 7078 switch (SetCCOpcode) { 7079 default: break; 7080 case ISD::SETOEQ: 7081 case ISD::SETEQ: SSECC = 0; break; 7082 case ISD::SETOGT: 7083 case ISD::SETGT: Swap = true; // Fallthrough 7084 case ISD::SETLT: 7085 case ISD::SETOLT: SSECC = 1; break; 7086 case ISD::SETOGE: 7087 case ISD::SETGE: Swap = true; // Fallthrough 7088 case ISD::SETLE: 7089 case ISD::SETOLE: SSECC = 2; break; 7090 case ISD::SETUO: SSECC = 3; break; 7091 case ISD::SETUNE: 7092 case ISD::SETNE: SSECC = 4; break; 7093 case ISD::SETULE: Swap = true; 7094 case ISD::SETUGE: SSECC = 5; break; 7095 case ISD::SETULT: Swap = true; 7096 case ISD::SETUGT: SSECC = 6; break; 7097 case ISD::SETO: SSECC = 7; break; 7098 } 7099 if (Swap) 7100 std::swap(Op0, Op1); 7101 7102 // In the two special cases we can't handle, emit two comparisons. 7103 if (SSECC == 8) { 7104 if (SetCCOpcode == ISD::SETUEQ) { 7105 SDValue UNORD, EQ; 7106 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 7107 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 7108 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 7109 } 7110 else if (SetCCOpcode == ISD::SETONE) { 7111 SDValue ORD, NEQ; 7112 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 7113 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 7114 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 7115 } 7116 llvm_unreachable("Illegal FP comparison"); 7117 } 7118 // Handle all other FP comparisons here. 7119 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 7120 } 7121 7122 // We are handling one of the integer comparisons here. Since SSE only has 7123 // GT and EQ comparisons for integer, swapping operands and multiple 7124 // operations may be required for some comparisons. 7125 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 7126 bool Swap = false, Invert = false, FlipSigns = false; 7127 7128 switch (VT.getSimpleVT().SimpleTy) { 7129 default: break; 7130 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 7131 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 7132 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 7133 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 7134 } 7135 7136 switch (SetCCOpcode) { 7137 default: break; 7138 case ISD::SETNE: Invert = true; 7139 case ISD::SETEQ: Opc = EQOpc; break; 7140 case ISD::SETLT: Swap = true; 7141 case ISD::SETGT: Opc = GTOpc; break; 7142 case ISD::SETGE: Swap = true; 7143 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 7144 case ISD::SETULT: Swap = true; 7145 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 7146 case ISD::SETUGE: Swap = true; 7147 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 7148 } 7149 if (Swap) 7150 std::swap(Op0, Op1); 7151 7152 // Since SSE has no unsigned integer comparisons, we need to flip the sign 7153 // bits of the inputs before performing those operations. 
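  // XORing both operands with the sign bit maps unsigned order onto signed
  // order (e.g. 0xFFFFFFFF, the unsigned max, becomes 0x7FFFFFFF, the signed
  // max), so the signed PCMPGT below also yields the unsigned result.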
7154 if (FlipSigns) { 7155 EVT EltVT = VT.getVectorElementType(); 7156 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 7157 EltVT); 7158 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 7159 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 7160 SignBits.size()); 7161 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 7162 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 7163 } 7164 7165 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 7166 7167 // If the logical-not of the result is required, perform that now. 7168 if (Invert) 7169 Result = DAG.getNOT(dl, Result, VT); 7170 7171 return Result; 7172} 7173 7174// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 7175static bool isX86LogicalCmp(SDValue Op) { 7176 unsigned Opc = Op.getNode()->getOpcode(); 7177 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 7178 return true; 7179 if (Op.getResNo() == 1 && 7180 (Opc == X86ISD::ADD || 7181 Opc == X86ISD::SUB || 7182 Opc == X86ISD::SMUL || 7183 Opc == X86ISD::UMUL || 7184 Opc == X86ISD::INC || 7185 Opc == X86ISD::DEC || 7186 Opc == X86ISD::OR || 7187 Opc == X86ISD::XOR || 7188 Opc == X86ISD::AND)) 7189 return true; 7190 7191 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 7192 return true; 7193 7194 return false; 7195} 7196 7197static bool isZero(SDValue V) { 7198 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 7199 return C && C->isNullValue(); 7200} 7201 7202static bool isAllOnes(SDValue V) { 7203 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 7204 return C && C->isAllOnesValue(); 7205} 7206 7207SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 7208 bool addTest = true; 7209 SDValue Cond = Op.getOperand(0); 7210 SDValue Op1 = Op.getOperand(1); 7211 SDValue Op2 = Op.getOperand(2); 7212 DebugLoc DL = Op.getDebugLoc(); 7213 SDValue CC; 7214 7215 if (Cond.getOpcode() == ISD::SETCC) { 7216 SDValue NewCond = LowerSETCC(Cond, DAG); 7217 if (NewCond.getNode()) 7218 Cond = NewCond; 7219 } 7220 7221 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 7222 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 7223 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 7224 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 7225 if (Cond.getOpcode() == X86ISD::SETCC && 7226 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 7227 isZero(Cond.getOperand(1).getOperand(1))) { 7228 SDValue Cmp = Cond.getOperand(1); 7229 7230 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 7231 7232 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 7233 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 7234 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 7235 7236 SDValue CmpOp0 = Cmp.getOperand(0); 7237 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 7238 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 7239 7240 SDValue Res = // Res = 0 or -1. 7241 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 7242 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 7243 7244 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 7245 Res = DAG.getNOT(DL, Res, Res.getValueType()); 7246 7247 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 7248 if (N2C == 0 || !N2C->isNullValue()) 7249 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 7250 return Res; 7251 } 7252 } 7253 7254 // Look past (and (setcc_carry (cmp ...)), 1). 
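  // (X86ISD::SETCC_CARRY is essentially "sbb reg,reg": it materializes 0 or
  // all-ones from the carry flag, which is what makes the select-to-OR
  // rewrite above work.  Because its result is 0 or -1, an AND with 1
  // wrapped around it, handled next, only keeps bit 0, so the carry node can
  // be used as the condition directly.)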
7255 if (Cond.getOpcode() == ISD::AND && 7256 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 7257 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 7258 if (C && C->getAPIntValue() == 1) 7259 Cond = Cond.getOperand(0); 7260 } 7261 7262 // If condition flag is set by a X86ISD::CMP, then use it as the condition 7263 // setting operand in place of the X86ISD::SETCC. 7264 if (Cond.getOpcode() == X86ISD::SETCC || 7265 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7266 CC = Cond.getOperand(0); 7267 7268 SDValue Cmp = Cond.getOperand(1); 7269 unsigned Opc = Cmp.getOpcode(); 7270 EVT VT = Op.getValueType(); 7271 7272 bool IllegalFPCMov = false; 7273 if (VT.isFloatingPoint() && !VT.isVector() && 7274 !isScalarFPTypeInSSEReg(VT)) // FPStack? 7275 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 7276 7277 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 7278 Opc == X86ISD::BT) { // FIXME 7279 Cond = Cmp; 7280 addTest = false; 7281 } 7282 } 7283 7284 if (addTest) { 7285 // Look pass the truncate. 7286 if (Cond.getOpcode() == ISD::TRUNCATE) 7287 Cond = Cond.getOperand(0); 7288 7289 // We know the result of AND is compared against zero. Try to match 7290 // it to BT. 7291 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 7292 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 7293 if (NewSetCC.getNode()) { 7294 CC = NewSetCC.getOperand(0); 7295 Cond = NewSetCC.getOperand(1); 7296 addTest = false; 7297 } 7298 } 7299 } 7300 7301 if (addTest) { 7302 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7303 Cond = EmitTest(Cond, X86::COND_NE, DAG); 7304 } 7305 7306 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 7307 // condition is true. 7308 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 7309 SDValue Ops[] = { Op2, Op1, CC, Cond }; 7310 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 7311} 7312 7313// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 7314// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 7315// from the AND / OR. 7316static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 7317 Opc = Op.getOpcode(); 7318 if (Opc != ISD::OR && Opc != ISD::AND) 7319 return false; 7320 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 7321 Op.getOperand(0).hasOneUse() && 7322 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 7323 Op.getOperand(1).hasOneUse()); 7324} 7325 7326// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 7327// 1 and that the SETCC node has a single use. 7328static bool isXor1OfSetCC(SDValue Op) { 7329 if (Op.getOpcode() != ISD::XOR) 7330 return false; 7331 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 7332 if (N1C && N1C->getAPIntValue() == 1) { 7333 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 7334 Op.getOperand(0).hasOneUse(); 7335 } 7336 return false; 7337} 7338 7339SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 7340 bool addTest = true; 7341 SDValue Chain = Op.getOperand(0); 7342 SDValue Cond = Op.getOperand(1); 7343 SDValue Dest = Op.getOperand(2); 7344 DebugLoc dl = Op.getDebugLoc(); 7345 SDValue CC; 7346 7347 if (Cond.getOpcode() == ISD::SETCC) { 7348 SDValue NewCond = LowerSETCC(Cond, DAG); 7349 if (NewCond.getNode()) 7350 Cond = NewCond; 7351 } 7352#if 0 7353 // FIXME: LowerXALUO doesn't handle these!! 
7354 else if (Cond.getOpcode() == X86ISD::ADD || 7355 Cond.getOpcode() == X86ISD::SUB || 7356 Cond.getOpcode() == X86ISD::SMUL || 7357 Cond.getOpcode() == X86ISD::UMUL) 7358 Cond = LowerXALUO(Cond, DAG); 7359#endif 7360 7361 // Look pass (and (setcc_carry (cmp ...)), 1). 7362 if (Cond.getOpcode() == ISD::AND && 7363 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 7364 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 7365 if (C && C->getAPIntValue() == 1) 7366 Cond = Cond.getOperand(0); 7367 } 7368 7369 // If condition flag is set by a X86ISD::CMP, then use it as the condition 7370 // setting operand in place of the X86ISD::SETCC. 7371 if (Cond.getOpcode() == X86ISD::SETCC || 7372 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7373 CC = Cond.getOperand(0); 7374 7375 SDValue Cmp = Cond.getOperand(1); 7376 unsigned Opc = Cmp.getOpcode(); 7377 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 7378 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 7379 Cond = Cmp; 7380 addTest = false; 7381 } else { 7382 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 7383 default: break; 7384 case X86::COND_O: 7385 case X86::COND_B: 7386 // These can only come from an arithmetic instruction with overflow, 7387 // e.g. SADDO, UADDO. 7388 Cond = Cond.getNode()->getOperand(1); 7389 addTest = false; 7390 break; 7391 } 7392 } 7393 } else { 7394 unsigned CondOpc; 7395 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 7396 SDValue Cmp = Cond.getOperand(0).getOperand(1); 7397 if (CondOpc == ISD::OR) { 7398 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 7399 // two branches instead of an explicit OR instruction with a 7400 // separate test. 7401 if (Cmp == Cond.getOperand(1).getOperand(1) && 7402 isX86LogicalCmp(Cmp)) { 7403 CC = Cond.getOperand(0).getOperand(0); 7404 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7405 Chain, Dest, CC, Cmp); 7406 CC = Cond.getOperand(1).getOperand(0); 7407 Cond = Cmp; 7408 addTest = false; 7409 } 7410 } else { // ISD::AND 7411 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 7412 // two branches instead of an explicit AND instruction with a 7413 // separate test. However, we only do this if this block doesn't 7414 // have a fall-through edge, because this requires an explicit 7415 // jmp when the condition is false. 7416 if (Cmp == Cond.getOperand(1).getOperand(1) && 7417 isX86LogicalCmp(Cmp) && 7418 Op.getNode()->hasOneUse()) { 7419 X86::CondCode CCode = 7420 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7421 CCode = X86::GetOppositeBranchCondition(CCode); 7422 CC = DAG.getConstant(CCode, MVT::i8); 7423 SDNode *User = *Op.getNode()->use_begin(); 7424 // Look for an unconditional branch following this conditional branch. 7425 // We need this because we need to reverse the successors in order 7426 // to implement FCMP_OEQ. 
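            // (For FCMP_OEQ the x86 condition is "equal and not parity",
            // which no single jcc tests; so, roughly, we emit two branches to
            // the false block - e.g. "jne false; jp false" - and retarget the
            // following unconditional branch at the true block.)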
7427 if (User->getOpcode() == ISD::BR) { 7428 SDValue FalseBB = User->getOperand(1); 7429 SDNode *NewBR = 7430 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 7431 assert(NewBR == User); 7432 (void)NewBR; 7433 Dest = FalseBB; 7434 7435 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7436 Chain, Dest, CC, Cmp); 7437 X86::CondCode CCode = 7438 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 7439 CCode = X86::GetOppositeBranchCondition(CCode); 7440 CC = DAG.getConstant(CCode, MVT::i8); 7441 Cond = Cmp; 7442 addTest = false; 7443 } 7444 } 7445 } 7446 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 7447 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 7448 // It should be transformed during dag combiner except when the condition 7449 // is set by a arithmetics with overflow node. 7450 X86::CondCode CCode = 7451 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7452 CCode = X86::GetOppositeBranchCondition(CCode); 7453 CC = DAG.getConstant(CCode, MVT::i8); 7454 Cond = Cond.getOperand(0).getOperand(1); 7455 addTest = false; 7456 } 7457 } 7458 7459 if (addTest) { 7460 // Look pass the truncate. 7461 if (Cond.getOpcode() == ISD::TRUNCATE) 7462 Cond = Cond.getOperand(0); 7463 7464 // We know the result of AND is compared against zero. Try to match 7465 // it to BT. 7466 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 7467 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 7468 if (NewSetCC.getNode()) { 7469 CC = NewSetCC.getOperand(0); 7470 Cond = NewSetCC.getOperand(1); 7471 addTest = false; 7472 } 7473 } 7474 } 7475 7476 if (addTest) { 7477 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7478 Cond = EmitTest(Cond, X86::COND_NE, DAG); 7479 } 7480 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7481 Chain, Dest, CC, Cond); 7482} 7483 7484 7485// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 7486// Calls to _alloca is needed to probe the stack when allocating more than 4k 7487// bytes in one go. Touching the stack at 4K increments is necessary to ensure 7488// that the guard pages used by the OS virtual memory manager are allocated in 7489// correct sequence. 7490SDValue 7491X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 7492 SelectionDAG &DAG) const { 7493 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && 7494 "This should be used only on Windows targets"); 7495 DebugLoc dl = Op.getDebugLoc(); 7496 7497 // Get the inputs. 7498 SDValue Chain = Op.getOperand(0); 7499 SDValue Size = Op.getOperand(1); 7500 // FIXME: Ensure alignment here 7501 7502 SDValue Flag; 7503 7504 EVT SPTy = Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32; 7505 7506 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 7507 Flag = Chain.getValue(1); 7508 7509 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 7510 7511 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 7512 Flag = Chain.getValue(1); 7513 7514 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 7515 7516 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 7517 return DAG.getMergeValues(Ops1, 2, dl); 7518} 7519 7520SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 7521 MachineFunction &MF = DAG.getMachineFunction(); 7522 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 7523 7524 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7525 DebugLoc DL = Op.getDebugLoc(); 7526 7527 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 7528 // vastart just stores the address of the VarArgsFrameIndex slot into the 7529 // memory location argument. 7530 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7531 getPointerTy()); 7532 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 7533 MachinePointerInfo(SV), false, false, 0); 7534 } 7535 7536 // __va_list_tag: 7537 // gp_offset (0 - 6 * 8) 7538 // fp_offset (48 - 48 + 8 * 16) 7539 // overflow_arg_area (point to parameters coming in memory). 7540 // reg_save_area 7541 SmallVector<SDValue, 8> MemOps; 7542 SDValue FIN = Op.getOperand(1); 7543 // Store gp_offset 7544 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 7545 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 7546 MVT::i32), 7547 FIN, MachinePointerInfo(SV), false, false, 0); 7548 MemOps.push_back(Store); 7549 7550 // Store fp_offset 7551 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7552 FIN, DAG.getIntPtrConstant(4)); 7553 Store = DAG.getStore(Op.getOperand(0), DL, 7554 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 7555 MVT::i32), 7556 FIN, MachinePointerInfo(SV, 4), false, false, 0); 7557 MemOps.push_back(Store); 7558 7559 // Store ptr to overflow_arg_area 7560 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7561 FIN, DAG.getIntPtrConstant(4)); 7562 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7563 getPointerTy()); 7564 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 7565 MachinePointerInfo(SV, 8), 7566 false, false, 0); 7567 MemOps.push_back(Store); 7568 7569 // Store ptr to reg_save_area. 
7570 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7571 FIN, DAG.getIntPtrConstant(8)); 7572 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 7573 getPointerTy()); 7574 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 7575 MachinePointerInfo(SV, 16), false, false, 0); 7576 MemOps.push_back(Store); 7577 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 7578 &MemOps[0], MemOps.size()); 7579} 7580 7581SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 7582 assert(Subtarget->is64Bit() && 7583 "LowerVAARG only handles 64-bit va_arg!"); 7584 assert((Subtarget->isTargetLinux() || 7585 Subtarget->isTargetDarwin()) && 7586 "Unhandled target in LowerVAARG"); 7587 assert(Op.getNode()->getNumOperands() == 4); 7588 SDValue Chain = Op.getOperand(0); 7589 SDValue SrcPtr = Op.getOperand(1); 7590 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7591 unsigned Align = Op.getConstantOperandVal(3); 7592 DebugLoc dl = Op.getDebugLoc(); 7593 7594 EVT ArgVT = Op.getNode()->getValueType(0); 7595 const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 7596 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 7597 uint8_t ArgMode; 7598 7599 // Decide which area this value should be read from. 7600 // TODO: Implement the AMD64 ABI in its entirety. This simple 7601 // selection mechanism works only for the basic types. 7602 if (ArgVT == MVT::f80) { 7603 llvm_unreachable("va_arg for f80 not yet implemented"); 7604 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 7605 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 7606 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 7607 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 7608 } else { 7609 llvm_unreachable("Unhandled argument type in LowerVAARG"); 7610 } 7611 7612 if (ArgMode == 2) { 7613 // Sanity Check: Make sure using fp_offset makes sense. 7614 assert(!UseSoftFloat && 7615 !(DAG.getMachineFunction() 7616 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 7617 Subtarget->hasXMM()); 7618 } 7619 7620 // Insert VAARG_64 node into the DAG 7621 // VAARG_64 returns two values: Variable Argument Address, Chain 7622 SmallVector<SDValue, 11> InstOps; 7623 InstOps.push_back(Chain); 7624 InstOps.push_back(SrcPtr); 7625 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 7626 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 7627 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 7628 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 7629 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 7630 VTs, &InstOps[0], InstOps.size(), 7631 MVT::i64, 7632 MachinePointerInfo(SV), 7633 /*Align=*/0, 7634 /*Volatile=*/false, 7635 /*ReadMem=*/true, 7636 /*WriteMem=*/true); 7637 Chain = VAARG.getValue(1); 7638 7639 // Load the next argument and return it 7640 return DAG.getLoad(ArgVT, dl, 7641 Chain, 7642 VAARG, 7643 MachinePointerInfo(), 7644 false, false, 0); 7645} 7646 7647SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 7648 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
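  // The struct has a fixed 24-byte size (4 + 4 + 8 + 8), so va_copy can be
  // lowered to a plain fixed-size copy. Roughly, in C (a sketch based on the
  // layout noted above):
  //   memcpy(dst, src, sizeof(struct __va_list_tag));  // 24 bytes, 8-aligned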
7649 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 7650 SDValue Chain = Op.getOperand(0); 7651 SDValue DstPtr = Op.getOperand(1); 7652 SDValue SrcPtr = Op.getOperand(2); 7653 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 7654 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7655 DebugLoc DL = Op.getDebugLoc(); 7656 7657 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 7658 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 7659 false, 7660 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 7661} 7662 7663SDValue 7664X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 7665 DebugLoc dl = Op.getDebugLoc(); 7666 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7667 switch (IntNo) { 7668 default: return SDValue(); // Don't custom lower most intrinsics. 7669 // Comparison intrinsics. 7670 case Intrinsic::x86_sse_comieq_ss: 7671 case Intrinsic::x86_sse_comilt_ss: 7672 case Intrinsic::x86_sse_comile_ss: 7673 case Intrinsic::x86_sse_comigt_ss: 7674 case Intrinsic::x86_sse_comige_ss: 7675 case Intrinsic::x86_sse_comineq_ss: 7676 case Intrinsic::x86_sse_ucomieq_ss: 7677 case Intrinsic::x86_sse_ucomilt_ss: 7678 case Intrinsic::x86_sse_ucomile_ss: 7679 case Intrinsic::x86_sse_ucomigt_ss: 7680 case Intrinsic::x86_sse_ucomige_ss: 7681 case Intrinsic::x86_sse_ucomineq_ss: 7682 case Intrinsic::x86_sse2_comieq_sd: 7683 case Intrinsic::x86_sse2_comilt_sd: 7684 case Intrinsic::x86_sse2_comile_sd: 7685 case Intrinsic::x86_sse2_comigt_sd: 7686 case Intrinsic::x86_sse2_comige_sd: 7687 case Intrinsic::x86_sse2_comineq_sd: 7688 case Intrinsic::x86_sse2_ucomieq_sd: 7689 case Intrinsic::x86_sse2_ucomilt_sd: 7690 case Intrinsic::x86_sse2_ucomile_sd: 7691 case Intrinsic::x86_sse2_ucomigt_sd: 7692 case Intrinsic::x86_sse2_ucomige_sd: 7693 case Intrinsic::x86_sse2_ucomineq_sd: { 7694 unsigned Opc = 0; 7695 ISD::CondCode CC = ISD::SETCC_INVALID; 7696 switch (IntNo) { 7697 default: break; 7698 case Intrinsic::x86_sse_comieq_ss: 7699 case Intrinsic::x86_sse2_comieq_sd: 7700 Opc = X86ISD::COMI; 7701 CC = ISD::SETEQ; 7702 break; 7703 case Intrinsic::x86_sse_comilt_ss: 7704 case Intrinsic::x86_sse2_comilt_sd: 7705 Opc = X86ISD::COMI; 7706 CC = ISD::SETLT; 7707 break; 7708 case Intrinsic::x86_sse_comile_ss: 7709 case Intrinsic::x86_sse2_comile_sd: 7710 Opc = X86ISD::COMI; 7711 CC = ISD::SETLE; 7712 break; 7713 case Intrinsic::x86_sse_comigt_ss: 7714 case Intrinsic::x86_sse2_comigt_sd: 7715 Opc = X86ISD::COMI; 7716 CC = ISD::SETGT; 7717 break; 7718 case Intrinsic::x86_sse_comige_ss: 7719 case Intrinsic::x86_sse2_comige_sd: 7720 Opc = X86ISD::COMI; 7721 CC = ISD::SETGE; 7722 break; 7723 case Intrinsic::x86_sse_comineq_ss: 7724 case Intrinsic::x86_sse2_comineq_sd: 7725 Opc = X86ISD::COMI; 7726 CC = ISD::SETNE; 7727 break; 7728 case Intrinsic::x86_sse_ucomieq_ss: 7729 case Intrinsic::x86_sse2_ucomieq_sd: 7730 Opc = X86ISD::UCOMI; 7731 CC = ISD::SETEQ; 7732 break; 7733 case Intrinsic::x86_sse_ucomilt_ss: 7734 case Intrinsic::x86_sse2_ucomilt_sd: 7735 Opc = X86ISD::UCOMI; 7736 CC = ISD::SETLT; 7737 break; 7738 case Intrinsic::x86_sse_ucomile_ss: 7739 case Intrinsic::x86_sse2_ucomile_sd: 7740 Opc = X86ISD::UCOMI; 7741 CC = ISD::SETLE; 7742 break; 7743 case Intrinsic::x86_sse_ucomigt_ss: 7744 case Intrinsic::x86_sse2_ucomigt_sd: 7745 Opc = X86ISD::UCOMI; 7746 CC = ISD::SETGT; 7747 break; 7748 case Intrinsic::x86_sse_ucomige_ss: 7749 case Intrinsic::x86_sse2_ucomige_sd: 7750 Opc = X86ISD::UCOMI; 7751 
CC = ISD::SETGE; 7752 break; 7753 case Intrinsic::x86_sse_ucomineq_ss: 7754 case Intrinsic::x86_sse2_ucomineq_sd: 7755 Opc = X86ISD::UCOMI; 7756 CC = ISD::SETNE; 7757 break; 7758 } 7759 7760 SDValue LHS = Op.getOperand(1); 7761 SDValue RHS = Op.getOperand(2); 7762 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 7763 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 7764 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 7765 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7766 DAG.getConstant(X86CC, MVT::i8), Cond); 7767 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7768 } 7769 // ptest and testp intrinsics. The intrinsic these come from are designed to 7770 // return an integer value, not just an instruction so lower it to the ptest 7771 // or testp pattern and a setcc for the result. 7772 case Intrinsic::x86_sse41_ptestz: 7773 case Intrinsic::x86_sse41_ptestc: 7774 case Intrinsic::x86_sse41_ptestnzc: 7775 case Intrinsic::x86_avx_ptestz_256: 7776 case Intrinsic::x86_avx_ptestc_256: 7777 case Intrinsic::x86_avx_ptestnzc_256: 7778 case Intrinsic::x86_avx_vtestz_ps: 7779 case Intrinsic::x86_avx_vtestc_ps: 7780 case Intrinsic::x86_avx_vtestnzc_ps: 7781 case Intrinsic::x86_avx_vtestz_pd: 7782 case Intrinsic::x86_avx_vtestc_pd: 7783 case Intrinsic::x86_avx_vtestnzc_pd: 7784 case Intrinsic::x86_avx_vtestz_ps_256: 7785 case Intrinsic::x86_avx_vtestc_ps_256: 7786 case Intrinsic::x86_avx_vtestnzc_ps_256: 7787 case Intrinsic::x86_avx_vtestz_pd_256: 7788 case Intrinsic::x86_avx_vtestc_pd_256: 7789 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7790 bool IsTestPacked = false; 7791 unsigned X86CC = 0; 7792 switch (IntNo) { 7793 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7794 case Intrinsic::x86_avx_vtestz_ps: 7795 case Intrinsic::x86_avx_vtestz_pd: 7796 case Intrinsic::x86_avx_vtestz_ps_256: 7797 case Intrinsic::x86_avx_vtestz_pd_256: 7798 IsTestPacked = true; // Fallthrough 7799 case Intrinsic::x86_sse41_ptestz: 7800 case Intrinsic::x86_avx_ptestz_256: 7801 // ZF = 1 7802 X86CC = X86::COND_E; 7803 break; 7804 case Intrinsic::x86_avx_vtestc_ps: 7805 case Intrinsic::x86_avx_vtestc_pd: 7806 case Intrinsic::x86_avx_vtestc_ps_256: 7807 case Intrinsic::x86_avx_vtestc_pd_256: 7808 IsTestPacked = true; // Fallthrough 7809 case Intrinsic::x86_sse41_ptestc: 7810 case Intrinsic::x86_avx_ptestc_256: 7811 // CF = 1 7812 X86CC = X86::COND_B; 7813 break; 7814 case Intrinsic::x86_avx_vtestnzc_ps: 7815 case Intrinsic::x86_avx_vtestnzc_pd: 7816 case Intrinsic::x86_avx_vtestnzc_ps_256: 7817 case Intrinsic::x86_avx_vtestnzc_pd_256: 7818 IsTestPacked = true; // Fallthrough 7819 case Intrinsic::x86_sse41_ptestnzc: 7820 case Intrinsic::x86_avx_ptestnzc_256: 7821 // ZF and CF = 0 7822 X86CC = X86::COND_A; 7823 break; 7824 } 7825 7826 SDValue LHS = Op.getOperand(1); 7827 SDValue RHS = Op.getOperand(2); 7828 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7829 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7830 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7831 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7832 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7833 } 7834 7835 // Fix vector shift instructions where the last operand is a non-immediate 7836 // i32 value. 
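  // An illustrative example of the rewrite performed below (a sketch; the
  // names are the IR-level intrinsics and the exact node order is up to the
  // DAG):
  //   x86_sse2_pslli_w(%v, %amt)      ; %amt is not a constant
  // becomes
  //   x86_sse2_psll_w(%v, bitcast(build_vector(%amt, 0, undef, undef)))
  // i.e. the i32 amount goes in the low element with the next element zeroed,
  // so the 64-bit shift count read by the instruction is %amt zero-extended.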
7837 case Intrinsic::x86_sse2_pslli_w: 7838 case Intrinsic::x86_sse2_pslli_d: 7839 case Intrinsic::x86_sse2_pslli_q: 7840 case Intrinsic::x86_sse2_psrli_w: 7841 case Intrinsic::x86_sse2_psrli_d: 7842 case Intrinsic::x86_sse2_psrli_q: 7843 case Intrinsic::x86_sse2_psrai_w: 7844 case Intrinsic::x86_sse2_psrai_d: 7845 case Intrinsic::x86_mmx_pslli_w: 7846 case Intrinsic::x86_mmx_pslli_d: 7847 case Intrinsic::x86_mmx_pslli_q: 7848 case Intrinsic::x86_mmx_psrli_w: 7849 case Intrinsic::x86_mmx_psrli_d: 7850 case Intrinsic::x86_mmx_psrli_q: 7851 case Intrinsic::x86_mmx_psrai_w: 7852 case Intrinsic::x86_mmx_psrai_d: { 7853 SDValue ShAmt = Op.getOperand(2); 7854 if (isa<ConstantSDNode>(ShAmt)) 7855 return SDValue(); 7856 7857 unsigned NewIntNo = 0; 7858 EVT ShAmtVT = MVT::v4i32; 7859 switch (IntNo) { 7860 case Intrinsic::x86_sse2_pslli_w: 7861 NewIntNo = Intrinsic::x86_sse2_psll_w; 7862 break; 7863 case Intrinsic::x86_sse2_pslli_d: 7864 NewIntNo = Intrinsic::x86_sse2_psll_d; 7865 break; 7866 case Intrinsic::x86_sse2_pslli_q: 7867 NewIntNo = Intrinsic::x86_sse2_psll_q; 7868 break; 7869 case Intrinsic::x86_sse2_psrli_w: 7870 NewIntNo = Intrinsic::x86_sse2_psrl_w; 7871 break; 7872 case Intrinsic::x86_sse2_psrli_d: 7873 NewIntNo = Intrinsic::x86_sse2_psrl_d; 7874 break; 7875 case Intrinsic::x86_sse2_psrli_q: 7876 NewIntNo = Intrinsic::x86_sse2_psrl_q; 7877 break; 7878 case Intrinsic::x86_sse2_psrai_w: 7879 NewIntNo = Intrinsic::x86_sse2_psra_w; 7880 break; 7881 case Intrinsic::x86_sse2_psrai_d: 7882 NewIntNo = Intrinsic::x86_sse2_psra_d; 7883 break; 7884 default: { 7885 ShAmtVT = MVT::v2i32; 7886 switch (IntNo) { 7887 case Intrinsic::x86_mmx_pslli_w: 7888 NewIntNo = Intrinsic::x86_mmx_psll_w; 7889 break; 7890 case Intrinsic::x86_mmx_pslli_d: 7891 NewIntNo = Intrinsic::x86_mmx_psll_d; 7892 break; 7893 case Intrinsic::x86_mmx_pslli_q: 7894 NewIntNo = Intrinsic::x86_mmx_psll_q; 7895 break; 7896 case Intrinsic::x86_mmx_psrli_w: 7897 NewIntNo = Intrinsic::x86_mmx_psrl_w; 7898 break; 7899 case Intrinsic::x86_mmx_psrli_d: 7900 NewIntNo = Intrinsic::x86_mmx_psrl_d; 7901 break; 7902 case Intrinsic::x86_mmx_psrli_q: 7903 NewIntNo = Intrinsic::x86_mmx_psrl_q; 7904 break; 7905 case Intrinsic::x86_mmx_psrai_w: 7906 NewIntNo = Intrinsic::x86_mmx_psra_w; 7907 break; 7908 case Intrinsic::x86_mmx_psrai_d: 7909 NewIntNo = Intrinsic::x86_mmx_psra_d; 7910 break; 7911 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 7912 } 7913 break; 7914 } 7915 } 7916 7917 // The vector shift intrinsics with scalars uses 32b shift amounts but 7918 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 7919 // to be zero. 7920 SDValue ShOps[4]; 7921 ShOps[0] = ShAmt; 7922 ShOps[1] = DAG.getConstant(0, MVT::i32); 7923 if (ShAmtVT == MVT::v4i32) { 7924 ShOps[2] = DAG.getUNDEF(MVT::i32); 7925 ShOps[3] = DAG.getUNDEF(MVT::i32); 7926 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7927 } else { 7928 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7929// FIXME this must be lowered to get rid of the invalid type. 
7930 } 7931 7932 EVT VT = Op.getValueType(); 7933 ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); 7934 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7935 DAG.getConstant(NewIntNo, MVT::i32), 7936 Op.getOperand(1), ShAmt); 7937 } 7938 } 7939} 7940 7941SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 7942 SelectionDAG &DAG) const { 7943 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7944 MFI->setReturnAddressIsTaken(true); 7945 7946 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7947 DebugLoc dl = Op.getDebugLoc(); 7948 7949 if (Depth > 0) { 7950 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7951 SDValue Offset = 7952 DAG.getConstant(TD->getPointerSize(), 7953 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7954 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7955 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7956 FrameAddr, Offset), 7957 MachinePointerInfo(), false, false, 0); 7958 } 7959 7960 // Just load the return address. 7961 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7962 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7963 RetAddrFI, MachinePointerInfo(), false, false, 0); 7964} 7965 7966SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 7967 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7968 MFI->setFrameAddressIsTaken(true); 7969 7970 EVT VT = Op.getValueType(); 7971 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 7972 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7973 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 7974 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 7975 while (Depth--) 7976 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 7977 MachinePointerInfo(), 7978 false, false, 0); 7979 return FrameAddr; 7980} 7981 7982SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 7983 SelectionDAG &DAG) const { 7984 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 7985} 7986 7987SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 7988 MachineFunction &MF = DAG.getMachineFunction(); 7989 SDValue Chain = Op.getOperand(0); 7990 SDValue Offset = Op.getOperand(1); 7991 SDValue Handler = Op.getOperand(2); 7992 DebugLoc dl = Op.getDebugLoc(); 7993 7994 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 7995 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 7996 getPointerTy()); 7997 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 7998 7999 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 8000 DAG.getIntPtrConstant(TD->getPointerSize())); 8001 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 8002 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 8003 false, false, 0); 8004 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 8005 MF.getRegInfo().addLiveOut(StoreAddrReg); 8006 8007 return DAG.getNode(X86ISD::EH_RETURN, dl, 8008 MVT::Other, 8009 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 8010} 8011 8012SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 8013 SelectionDAG &DAG) const { 8014 SDValue Root = Op.getOperand(0); 8015 SDValue Trmp = Op.getOperand(1); // trampoline 8016 SDValue FPtr = Op.getOperand(2); // nested function 8017 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 8018 DebugLoc dl = Op.getDebugLoc(); 8019 8020 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 8021 8022 if (Subtarget->is64Bit()) { 8023 SDValue OutChains[6]; 8024 8025 // Large code-model. 8026 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 8027 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 8028 8029 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 8030 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 8031 8032 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 8033 8034 // Load the pointer to the nested function into R11. 8035 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 8036 SDValue Addr = Trmp; 8037 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8038 Addr, MachinePointerInfo(TrmpAddr), 8039 false, false, 0); 8040 8041 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8042 DAG.getConstant(2, MVT::i64)); 8043 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 8044 MachinePointerInfo(TrmpAddr, 2), 8045 false, false, 2); 8046 8047 // Load the 'nest' parameter value into R10. 8048 // R10 is specified in X86CallingConv.td 8049 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 8050 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8051 DAG.getConstant(10, MVT::i64)); 8052 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8053 Addr, MachinePointerInfo(TrmpAddr, 10), 8054 false, false, 0); 8055 8056 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8057 DAG.getConstant(12, MVT::i64)); 8058 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 8059 MachinePointerInfo(TrmpAddr, 12), 8060 false, false, 2); 8061 8062 // Jump to the nested function. 8063 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
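    // For reference, the complete 64-bit trampoline assembled by the six
    // stores above and below (byte offsets and encodings as emitted here;
    // shown only as a sketch):
    //   +0   49 BB <imm64 FPtr>   movabs $FPtr, %r11
    //   +10  49 BA <imm64 Nest>   movabs $Nest, %r10
    //   +20  49 FF E3             jmpq   *%r11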
8064 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8065 DAG.getConstant(20, MVT::i64)); 8066 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8067 Addr, MachinePointerInfo(TrmpAddr, 20), 8068 false, false, 0); 8069 8070 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 8071 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8072 DAG.getConstant(22, MVT::i64)); 8073 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 8074 MachinePointerInfo(TrmpAddr, 22), 8075 false, false, 0); 8076 8077 SDValue Ops[] = 8078 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 8079 return DAG.getMergeValues(Ops, 2, dl); 8080 } else { 8081 const Function *Func = 8082 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 8083 CallingConv::ID CC = Func->getCallingConv(); 8084 unsigned NestReg; 8085 8086 switch (CC) { 8087 default: 8088 llvm_unreachable("Unsupported calling convention"); 8089 case CallingConv::C: 8090 case CallingConv::X86_StdCall: { 8091 // Pass 'nest' parameter in ECX. 8092 // Must be kept in sync with X86CallingConv.td 8093 NestReg = X86::ECX; 8094 8095 // Check that ECX wasn't needed by an 'inreg' parameter. 8096 const FunctionType *FTy = Func->getFunctionType(); 8097 const AttrListPtr &Attrs = Func->getAttributes(); 8098 8099 if (!Attrs.isEmpty() && !Func->isVarArg()) { 8100 unsigned InRegCount = 0; 8101 unsigned Idx = 1; 8102 8103 for (FunctionType::param_iterator I = FTy->param_begin(), 8104 E = FTy->param_end(); I != E; ++I, ++Idx) 8105 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 8106 // FIXME: should only count parameters that are lowered to integers. 8107 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 8108 8109 if (InRegCount > 2) { 8110 report_fatal_error("Nest register in use - reduce number of inreg" 8111 " parameters!"); 8112 } 8113 } 8114 break; 8115 } 8116 case CallingConv::X86_FastCall: 8117 case CallingConv::X86_ThisCall: 8118 case CallingConv::Fast: 8119 // Pass 'nest' parameter in EAX. 8120 // Must be kept in sync with X86CallingConv.td 8121 NestReg = X86::EAX; 8122 break; 8123 } 8124 8125 SDValue OutChains[4]; 8126 SDValue Addr, Disp; 8127 8128 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8129 DAG.getConstant(10, MVT::i32)); 8130 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 8131 8132 // This is storing the opcode for MOV32ri. 8133 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 8134 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 8135 OutChains[0] = DAG.getStore(Root, dl, 8136 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 8137 Trmp, MachinePointerInfo(TrmpAddr), 8138 false, false, 0); 8139 8140 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8141 DAG.getConstant(1, MVT::i32)); 8142 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 8143 MachinePointerInfo(TrmpAddr, 1), 8144 false, false, 1); 8145 8146 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
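    // For reference, the 10-byte 32-bit trampoline assembled by the stores in
    // this branch (a sketch; NestReg is ECX or EAX as selected above):
    //   +0  B8+r <imm32 Nest>   movl $Nest, %NestReg
    //   +5  E9   <rel32 Disp>   jmp  FPtr         ; Disp = FPtr - (Trmp + 10)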
8147 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8148 DAG.getConstant(5, MVT::i32)); 8149 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 8150 MachinePointerInfo(TrmpAddr, 5), 8151 false, false, 1); 8152 8153 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8154 DAG.getConstant(6, MVT::i32)); 8155 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 8156 MachinePointerInfo(TrmpAddr, 6), 8157 false, false, 1); 8158 8159 SDValue Ops[] = 8160 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 8161 return DAG.getMergeValues(Ops, 2, dl); 8162 } 8163} 8164 8165SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 8166 SelectionDAG &DAG) const { 8167 /* 8168 The rounding mode is in bits 11:10 of FPSR, and has the following 8169 settings: 8170 00 Round to nearest 8171 01 Round to -inf 8172 10 Round to +inf 8173 11 Round to 0 8174 8175 FLT_ROUNDS, on the other hand, expects the following: 8176 -1 Undefined 8177 0 Round to 0 8178 1 Round to nearest 8179 2 Round to +inf 8180 3 Round to -inf 8181 8182 To perform the conversion, we do: 8183 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 8184 */ 8185 8186 MachineFunction &MF = DAG.getMachineFunction(); 8187 const TargetMachine &TM = MF.getTarget(); 8188 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 8189 unsigned StackAlignment = TFI.getStackAlignment(); 8190 EVT VT = Op.getValueType(); 8191 DebugLoc DL = Op.getDebugLoc(); 8192 8193 // Save FP Control Word to stack slot 8194 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 8195 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8196 8197 8198 MachineMemOperand *MMO = 8199 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8200 MachineMemOperand::MOStore, 2, 2); 8201 8202 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 8203 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 8204 DAG.getVTList(MVT::Other), 8205 Ops, 2, MVT::i16, MMO); 8206 8207 // Load FP Control Word from stack slot 8208 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 8209 MachinePointerInfo(), false, false, 0); 8210 8211 // Transform as necessary 8212 SDValue CWD1 = 8213 DAG.getNode(ISD::SRL, DL, MVT::i16, 8214 DAG.getNode(ISD::AND, DL, MVT::i16, 8215 CWD, DAG.getConstant(0x800, MVT::i16)), 8216 DAG.getConstant(11, MVT::i8)); 8217 SDValue CWD2 = 8218 DAG.getNode(ISD::SRL, DL, MVT::i16, 8219 DAG.getNode(ISD::AND, DL, MVT::i16, 8220 CWD, DAG.getConstant(0x400, MVT::i16)), 8221 DAG.getConstant(9, MVT::i8)); 8222 8223 SDValue RetVal = 8224 DAG.getNode(ISD::AND, DL, MVT::i16, 8225 DAG.getNode(ISD::ADD, DL, MVT::i16, 8226 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 8227 DAG.getConstant(1, MVT::i16)), 8228 DAG.getConstant(3, MVT::i16)); 8229 8230 8231 return DAG.getNode((VT.getSizeInBits() < 16 ? 8232 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 8233} 8234 8235SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 8236 EVT VT = Op.getValueType(); 8237 EVT OpVT = VT; 8238 unsigned NumBits = VT.getSizeInBits(); 8239 DebugLoc dl = Op.getDebugLoc(); 8240 8241 Op = Op.getOperand(0); 8242 if (VT == MVT::i8) { 8243 // Zero extend to i32 since there is not an i8 bsr. 8244 OpVT = MVT::i32; 8245 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8246 } 8247 8248 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 8249 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8250 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 8251 8252 // If src is zero (i.e. 
bsr sets ZF), returns NumBits. 8253 SDValue Ops[] = { 8254 Op, 8255 DAG.getConstant(NumBits+NumBits-1, OpVT), 8256 DAG.getConstant(X86::COND_E, MVT::i8), 8257 Op.getValue(1) 8258 }; 8259 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8260 8261 // Finally xor with NumBits-1. 8262 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 8263 8264 if (VT == MVT::i8) 8265 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8266 return Op; 8267} 8268 8269SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 8270 EVT VT = Op.getValueType(); 8271 EVT OpVT = VT; 8272 unsigned NumBits = VT.getSizeInBits(); 8273 DebugLoc dl = Op.getDebugLoc(); 8274 8275 Op = Op.getOperand(0); 8276 if (VT == MVT::i8) { 8277 OpVT = MVT::i32; 8278 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8279 } 8280 8281 // Issue a bsf (scan bits forward) which also sets EFLAGS. 8282 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8283 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 8284 8285 // If src is zero (i.e. bsf sets ZF), returns NumBits. 8286 SDValue Ops[] = { 8287 Op, 8288 DAG.getConstant(NumBits, OpVT), 8289 DAG.getConstant(X86::COND_E, MVT::i8), 8290 Op.getValue(1) 8291 }; 8292 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8293 8294 if (VT == MVT::i8) 8295 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8296 return Op; 8297} 8298 8299SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 8300 EVT VT = Op.getValueType(); 8301 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 8302 DebugLoc dl = Op.getDebugLoc(); 8303 8304 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 8305 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 8306 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 8307 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 8308 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 8309 // 8310 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 8311 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 8312 // return AloBlo + AloBhi + AhiBlo; 8313 8314 SDValue A = Op.getOperand(0); 8315 SDValue B = Op.getOperand(1); 8316 8317 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8318 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8319 A, DAG.getConstant(32, MVT::i32)); 8320 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8321 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8322 B, DAG.getConstant(32, MVT::i32)); 8323 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8324 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8325 A, B); 8326 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8327 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8328 A, Bhi); 8329 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8330 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8331 Ahi, B); 8332 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8333 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8334 AloBhi, DAG.getConstant(32, MVT::i32)); 8335 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8336 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8337 AhiBlo, DAG.getConstant(32, MVT::i32)); 8338 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 8339 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 8340 return Res; 8341} 8342 8343SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { 8344 EVT VT = Op.getValueType(); 8345 DebugLoc dl = 
Op.getDebugLoc(); 8346 SDValue R = Op.getOperand(0); 8347 8348 LLVMContext *Context = DAG.getContext(); 8349 8350 assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); 8351 8352 if (VT == MVT::v4i32) { 8353 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8354 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 8355 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 8356 8357 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 8358 8359 std::vector<Constant*> CV(4, CI); 8360 Constant *C = ConstantVector::get(CV); 8361 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8362 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8363 MachinePointerInfo::getConstantPool(), 8364 false, false, 16); 8365 8366 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 8367 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 8368 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 8369 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 8370 } 8371 if (VT == MVT::v16i8) { 8372 // a = a << 5; 8373 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8374 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 8375 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 8376 8377 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 8378 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 8379 8380 std::vector<Constant*> CVM1(16, CM1); 8381 std::vector<Constant*> CVM2(16, CM2); 8382 Constant *C = ConstantVector::get(CVM1); 8383 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8384 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8385 MachinePointerInfo::getConstantPool(), 8386 false, false, 16); 8387 8388 // r = pblendv(r, psllw(r & (char16)15, 4), a); 8389 M = DAG.getNode(ISD::AND, dl, VT, R, M); 8390 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8391 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 8392 DAG.getConstant(4, MVT::i32)); 8393 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8394 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 8395 R, M, Op); 8396 // a += a 8397 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 8398 8399 C = ConstantVector::get(CVM2); 8400 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8401 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8402 MachinePointerInfo::getConstantPool(), 8403 false, false, 16); 8404 8405 // r = pblendv(r, psllw(r & (char16)63, 2), a); 8406 M = DAG.getNode(ISD::AND, dl, VT, R, M); 8407 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8408 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 8409 DAG.getConstant(2, MVT::i32)); 8410 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8411 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 8412 R, M, Op); 8413 // a += a 8414 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 8415 8416 // return pblendv(r, r+r, a); 8417 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8418 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 8419 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 8420 return R; 8421 } 8422 return SDValue(); 8423} 8424 8425SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 8426 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 8427 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 8428 // looks for this combo and may remove the "setcc" instruction if the "setcc" 8429 // has only one use. 
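  //
  // For example, an SADDO node is rewritten roughly as follows (a sketch;
  // value naming is informal):
  //   Res, Ovf = ISD::SADDO %a, %b
  // becomes
  //   Sum, EFLAGS = X86ISD::ADD %a, %b
  //   Ovf         = X86ISD::SETCC COND_O, EFLAGS
  // and all uses of the original overflow result are redirected to Ovf.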
8430 SDNode *N = Op.getNode(); 8431 SDValue LHS = N->getOperand(0); 8432 SDValue RHS = N->getOperand(1); 8433 unsigned BaseOp = 0; 8434 unsigned Cond = 0; 8435 DebugLoc DL = Op.getDebugLoc(); 8436 switch (Op.getOpcode()) { 8437 default: llvm_unreachable("Unknown ovf instruction!"); 8438 case ISD::SADDO: 8439 // A subtract of one will be selected as a INC. Note that INC doesn't 8440 // set CF, so we can't do this for UADDO. 8441 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 8442 if (C->getAPIntValue() == 1) { 8443 BaseOp = X86ISD::INC; 8444 Cond = X86::COND_O; 8445 break; 8446 } 8447 BaseOp = X86ISD::ADD; 8448 Cond = X86::COND_O; 8449 break; 8450 case ISD::UADDO: 8451 BaseOp = X86ISD::ADD; 8452 Cond = X86::COND_B; 8453 break; 8454 case ISD::SSUBO: 8455 // A subtract of one will be selected as a DEC. Note that DEC doesn't 8456 // set CF, so we can't do this for USUBO. 8457 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 8458 if (C->getAPIntValue() == 1) { 8459 BaseOp = X86ISD::DEC; 8460 Cond = X86::COND_O; 8461 break; 8462 } 8463 BaseOp = X86ISD::SUB; 8464 Cond = X86::COND_O; 8465 break; 8466 case ISD::USUBO: 8467 BaseOp = X86ISD::SUB; 8468 Cond = X86::COND_B; 8469 break; 8470 case ISD::SMULO: 8471 BaseOp = X86ISD::SMUL; 8472 Cond = X86::COND_O; 8473 break; 8474 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 8475 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 8476 MVT::i32); 8477 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 8478 8479 SDValue SetCC = 8480 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 8481 DAG.getConstant(X86::COND_O, MVT::i32), 8482 SDValue(Sum.getNode(), 2)); 8483 8484 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 8485 return Sum; 8486 } 8487 } 8488 8489 // Also sets EFLAGS. 8490 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 8491 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 8492 8493 SDValue SetCC = 8494 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 8495 DAG.getConstant(Cond, MVT::i32), 8496 SDValue(Sum.getNode(), 1)); 8497 8498 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 8499 return Sum; 8500} 8501 8502SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 8503 DebugLoc dl = Op.getDebugLoc(); 8504 8505 if (!Subtarget->hasSSE2()) { 8506 SDValue Chain = Op.getOperand(0); 8507 SDValue Zero = DAG.getConstant(0, 8508 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 8509 SDValue Ops[] = { 8510 DAG.getRegister(X86::ESP, MVT::i32), // Base 8511 DAG.getTargetConstant(1, MVT::i8), // Scale 8512 DAG.getRegister(0, MVT::i32), // Index 8513 DAG.getTargetConstant(0, MVT::i32), // Disp 8514 DAG.getRegister(0, MVT::i32), // Segment. 
8515 Zero, 8516 Chain 8517 }; 8518 SDNode *Res = 8519 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 8520 array_lengthof(Ops)); 8521 return SDValue(Res, 0); 8522 } 8523 8524 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 8525 if (!isDev) 8526 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 8527 8528 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8529 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8530 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 8531 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 8532 8533 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 8534 if (!Op1 && !Op2 && !Op3 && Op4) 8535 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 8536 8537 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 8538 if (Op1 && !Op2 && !Op3 && !Op4) 8539 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 8540 8541 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 8542 // (MFENCE)>; 8543 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 8544} 8545 8546SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 8547 EVT T = Op.getValueType(); 8548 DebugLoc DL = Op.getDebugLoc(); 8549 unsigned Reg = 0; 8550 unsigned size = 0; 8551 switch(T.getSimpleVT().SimpleTy) { 8552 default: 8553 assert(false && "Invalid value type!"); 8554 case MVT::i8: Reg = X86::AL; size = 1; break; 8555 case MVT::i16: Reg = X86::AX; size = 2; break; 8556 case MVT::i32: Reg = X86::EAX; size = 4; break; 8557 case MVT::i64: 8558 assert(Subtarget->is64Bit() && "Node not type legal!"); 8559 Reg = X86::RAX; size = 8; 8560 break; 8561 } 8562 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 8563 Op.getOperand(2), SDValue()); 8564 SDValue Ops[] = { cpIn.getValue(0), 8565 Op.getOperand(1), 8566 Op.getOperand(3), 8567 DAG.getTargetConstant(size, MVT::i8), 8568 cpIn.getValue(1) }; 8569 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8570 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 8571 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 8572 Ops, 5, T, MMO); 8573 SDValue cpOut = 8574 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 8575 return cpOut; 8576} 8577 8578SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 8579 SelectionDAG &DAG) const { 8580 assert(Subtarget->is64Bit() && "Result not type legalized?"); 8581 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8582 SDValue TheChain = Op.getOperand(0); 8583 DebugLoc dl = Op.getDebugLoc(); 8584 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8585 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 8586 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 8587 rax.getValue(2)); 8588 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 8589 DAG.getConstant(32, MVT::i8)); 8590 SDValue Ops[] = { 8591 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 8592 rdx.getValue(1) 8593 }; 8594 return DAG.getMergeValues(Ops, 2, dl); 8595} 8596 8597SDValue X86TargetLowering::LowerBITCAST(SDValue Op, 8598 SelectionDAG &DAG) const { 8599 EVT SrcVT = Op.getOperand(0).getValueType(); 8600 EVT DstVT = Op.getValueType(); 8601 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 8602 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 
8603 assert((DstVT == MVT::i64 || 8604 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 8605 "Unexpected custom BITCAST"); 8606 // i64 <=> MMX conversions are Legal. 8607 if (SrcVT==MVT::i64 && DstVT.isVector()) 8608 return Op; 8609 if (DstVT==MVT::i64 && SrcVT.isVector()) 8610 return Op; 8611 // MMX <=> MMX conversions are Legal. 8612 if (SrcVT.isVector() && DstVT.isVector()) 8613 return Op; 8614 // All other conversions need to be expanded. 8615 return SDValue(); 8616} 8617SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 8618 SDNode *Node = Op.getNode(); 8619 DebugLoc dl = Node->getDebugLoc(); 8620 EVT T = Node->getValueType(0); 8621 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 8622 DAG.getConstant(0, T), Node->getOperand(2)); 8623 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 8624 cast<AtomicSDNode>(Node)->getMemoryVT(), 8625 Node->getOperand(0), 8626 Node->getOperand(1), negOp, 8627 cast<AtomicSDNode>(Node)->getSrcValue(), 8628 cast<AtomicSDNode>(Node)->getAlignment()); 8629} 8630 8631/// LowerOperation - Provide custom lowering hooks for some operations. 8632/// 8633SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 8634 switch (Op.getOpcode()) { 8635 default: llvm_unreachable("Should not custom lower this!"); 8636 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 8637 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 8638 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 8639 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 8640 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 8641 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 8642 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 8643 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 8644 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 8645 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 8646 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 8647 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 8648 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 8649 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 8650 case ISD::SHL_PARTS: 8651 case ISD::SRA_PARTS: 8652 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 8653 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 8654 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 8655 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 8656 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 8657 case ISD::FABS: return LowerFABS(Op, DAG); 8658 case ISD::FNEG: return LowerFNEG(Op, DAG); 8659 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 8660 case ISD::SETCC: return LowerSETCC(Op, DAG); 8661 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 8662 case ISD::SELECT: return LowerSELECT(Op, DAG); 8663 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 8664 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 8665 case ISD::VASTART: return LowerVASTART(Op, DAG); 8666 case ISD::VAARG: return LowerVAARG(Op, DAG); 8667 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 8668 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 8669 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 8670 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 8671 case ISD::FRAME_TO_ARGS_OFFSET: 8672 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 8673 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 8674 case ISD::EH_RETURN: return 
LowerEH_RETURN(Op, DAG); 8675 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 8676 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 8677 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 8678 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 8679 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 8680 case ISD::SHL: return LowerSHL(Op, DAG); 8681 case ISD::SADDO: 8682 case ISD::UADDO: 8683 case ISD::SSUBO: 8684 case ISD::USUBO: 8685 case ISD::SMULO: 8686 case ISD::UMULO: return LowerXALUO(Op, DAG); 8687 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 8688 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 8689 } 8690} 8691 8692void X86TargetLowering:: 8693ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 8694 SelectionDAG &DAG, unsigned NewOp) const { 8695 EVT T = Node->getValueType(0); 8696 DebugLoc dl = Node->getDebugLoc(); 8697 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 8698 8699 SDValue Chain = Node->getOperand(0); 8700 SDValue In1 = Node->getOperand(1); 8701 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8702 Node->getOperand(2), DAG.getIntPtrConstant(0)); 8703 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8704 Node->getOperand(2), DAG.getIntPtrConstant(1)); 8705 SDValue Ops[] = { Chain, In1, In2L, In2H }; 8706 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 8707 SDValue Result = 8708 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 8709 cast<MemSDNode>(Node)->getMemOperand()); 8710 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 8711 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8712 Results.push_back(Result.getValue(2)); 8713} 8714 8715/// ReplaceNodeResults - Replace a node with an illegal result type 8716/// with a new node built out of custom code. 8717void X86TargetLowering::ReplaceNodeResults(SDNode *N, 8718 SmallVectorImpl<SDValue>&Results, 8719 SelectionDAG &DAG) const { 8720 DebugLoc dl = N->getDebugLoc(); 8721 switch (N->getOpcode()) { 8722 default: 8723 assert(false && "Do not know how to custom type legalize this operation!"); 8724 return; 8725 case ISD::FP_TO_SINT: { 8726 std::pair<SDValue,SDValue> Vals = 8727 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 8728 SDValue FIST = Vals.first, StackSlot = Vals.second; 8729 if (FIST.getNode() != 0) { 8730 EVT VT = N->getValueType(0); 8731 // Return a load from the stack slot. 8732 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 8733 MachinePointerInfo(), false, false, 0)); 8734 } 8735 return; 8736 } 8737 case ISD::READCYCLECOUNTER: { 8738 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8739 SDValue TheChain = N->getOperand(0); 8740 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8741 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 8742 rd.getValue(1)); 8743 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 8744 eax.getValue(2)); 8745 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
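    // In C terms (illustrative): result = ((uint64_t)EDX << 32) | EAX;
    // BUILD_PAIR takes its operands as (lo, hi) = (eax, edx).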
8746 SDValue Ops[] = { eax, edx }; 8747 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 8748 Results.push_back(edx.getValue(1)); 8749 return; 8750 } 8751 case ISD::ATOMIC_CMP_SWAP: { 8752 EVT T = N->getValueType(0); 8753 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 8754 SDValue cpInL, cpInH; 8755 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8756 DAG.getConstant(0, MVT::i32)); 8757 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8758 DAG.getConstant(1, MVT::i32)); 8759 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 8760 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 8761 cpInL.getValue(1)); 8762 SDValue swapInL, swapInH; 8763 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8764 DAG.getConstant(0, MVT::i32)); 8765 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8766 DAG.getConstant(1, MVT::i32)); 8767 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 8768 cpInH.getValue(1)); 8769 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 8770 swapInL.getValue(1)); 8771 SDValue Ops[] = { swapInH.getValue(0), 8772 N->getOperand(1), 8773 swapInH.getValue(1) }; 8774 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8775 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 8776 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, 8777 Ops, 3, T, MMO); 8778 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 8779 MVT::i32, Result.getValue(1)); 8780 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 8781 MVT::i32, cpOutL.getValue(2)); 8782 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 8783 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8784 Results.push_back(cpOutH.getValue(1)); 8785 return; 8786 } 8787 case ISD::ATOMIC_LOAD_ADD: 8788 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 8789 return; 8790 case ISD::ATOMIC_LOAD_AND: 8791 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 8792 return; 8793 case ISD::ATOMIC_LOAD_NAND: 8794 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 8795 return; 8796 case ISD::ATOMIC_LOAD_OR: 8797 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 8798 return; 8799 case ISD::ATOMIC_LOAD_SUB: 8800 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 8801 return; 8802 case ISD::ATOMIC_LOAD_XOR: 8803 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 8804 return; 8805 case ISD::ATOMIC_SWAP: 8806 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 8807 return; 8808 } 8809} 8810 8811const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 8812 switch (Opcode) { 8813 default: return NULL; 8814 case X86ISD::BSF: return "X86ISD::BSF"; 8815 case X86ISD::BSR: return "X86ISD::BSR"; 8816 case X86ISD::SHLD: return "X86ISD::SHLD"; 8817 case X86ISD::SHRD: return "X86ISD::SHRD"; 8818 case X86ISD::FAND: return "X86ISD::FAND"; 8819 case X86ISD::FOR: return "X86ISD::FOR"; 8820 case X86ISD::FXOR: return "X86ISD::FXOR"; 8821 case X86ISD::FSRL: return "X86ISD::FSRL"; 8822 case X86ISD::FILD: return "X86ISD::FILD"; 8823 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 8824 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 8825 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 8826 case 
X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 8827 case X86ISD::FLD: return "X86ISD::FLD"; 8828 case X86ISD::FST: return "X86ISD::FST"; 8829 case X86ISD::CALL: return "X86ISD::CALL"; 8830 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 8831 case X86ISD::BT: return "X86ISD::BT"; 8832 case X86ISD::CMP: return "X86ISD::CMP"; 8833 case X86ISD::COMI: return "X86ISD::COMI"; 8834 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 8835 case X86ISD::SETCC: return "X86ISD::SETCC"; 8836 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 8837 case X86ISD::CMOV: return "X86ISD::CMOV"; 8838 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 8839 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 8840 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 8841 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 8842 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 8843 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 8844 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 8845 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 8846 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 8847 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 8848 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 8849 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 8850 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 8851 case X86ISD::PANDN: return "X86ISD::PANDN"; 8852 case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; 8853 case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; 8854 case X86ISD::PSIGND: return "X86ISD::PSIGND"; 8855 case X86ISD::FMAX: return "X86ISD::FMAX"; 8856 case X86ISD::FMIN: return "X86ISD::FMIN"; 8857 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 8858 case X86ISD::FRCP: return "X86ISD::FRCP"; 8859 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 8860 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 8861 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 8862 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 8863 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 8864 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 8865 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 8866 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 8867 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 8868 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 8869 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 8870 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 8871 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 8872 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 8873 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 8874 case X86ISD::VSHL: return "X86ISD::VSHL"; 8875 case X86ISD::VSRL: return "X86ISD::VSRL"; 8876 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 8877 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 8878 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 8879 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 8880 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 8881 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 8882 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 8883 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 8884 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 8885 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 8886 case X86ISD::ADD: return "X86ISD::ADD"; 8887 case X86ISD::SUB: return "X86ISD::SUB"; 8888 case X86ISD::SMUL: return "X86ISD::SMUL"; 8889 case X86ISD::UMUL: return "X86ISD::UMUL"; 8890 case X86ISD::INC: return "X86ISD::INC"; 8891 case X86ISD::DEC: return "X86ISD::DEC"; 8892 case X86ISD::OR: return "X86ISD::OR"; 8893 case 
X86ISD::XOR: return "X86ISD::XOR"; 8894 case X86ISD::AND: return "X86ISD::AND"; 8895 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 8896 case X86ISD::PTEST: return "X86ISD::PTEST"; 8897 case X86ISD::TESTP: return "X86ISD::TESTP"; 8898 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 8899 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 8900 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 8901 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 8902 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 8903 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 8904 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 8905 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 8906 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 8907 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 8908 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 8909 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 8910 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 8911 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 8912 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 8913 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 8914 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 8915 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 8916 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 8917 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 8918 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 8919 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 8920 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 8921 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 8922 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 8923 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 8924 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 8925 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 8926 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 8927 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 8928 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 8929 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 8930 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 8931 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 8932 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 8933 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 8934 } 8935} 8936 8937// isLegalAddressingMode - Return true if the addressing mode represented 8938// by AM is legal for this target, for a load/store of the specified type. 8939bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 8940 const Type *Ty) const { 8941 // X86 supports extremely general addressing modes. 8942 CodeModel::Model M = getTargetMachine().getCodeModel(); 8943 Reloc::Model R = getTargetMachine().getRelocationModel(); 8944 8945 // X86 allows a sign-extended 32-bit immediate field as a displacement. 8946 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 8947 return false; 8948 8949 if (AM.BaseGV) { 8950 unsigned GVFlags = 8951 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 8952 8953 // If a reference to this global requires an extra load, we can't fold it. 8954 if (isGlobalStubReference(GVFlags)) 8955 return false; 8956 8957 // If BaseGV requires a register for the PIC base, we cannot also have a 8958 // BaseReg specified. 8959 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 8960 return false; 8961 8962 // If lower 4G is not available, then we must use rip-relative addressing. 
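    // Concretely (a sketch of what the check below rejects): with a non-small
    // code model or non-static relocation on x86-64, the global must be
    // reached RIP-relatively; that form has no index register and a single
    // 32-bit displacement, so folding an extra offset (GV + 16) or a scale
    // (GV + 4*%reg) is conservatively refused.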
8963 if ((M != CodeModel::Small || R != Reloc::Static) && 8964 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 8965 return false; 8966 } 8967 8968 switch (AM.Scale) { 8969 case 0: 8970 case 1: 8971 case 2: 8972 case 4: 8973 case 8: 8974 // These scales always work. 8975 break; 8976 case 3: 8977 case 5: 8978 case 9: 8979 // These scales are formed with basereg+scalereg. Only accept if there is 8980 // no basereg yet. 8981 if (AM.HasBaseReg) 8982 return false; 8983 break; 8984 default: // Other stuff never works. 8985 return false; 8986 } 8987 8988 return true; 8989} 8990 8991 8992bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 8993 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 8994 return false; 8995 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 8996 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 8997 if (NumBits1 <= NumBits2) 8998 return false; 8999 return true; 9000} 9001 9002bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 9003 if (!VT1.isInteger() || !VT2.isInteger()) 9004 return false; 9005 unsigned NumBits1 = VT1.getSizeInBits(); 9006 unsigned NumBits2 = VT2.getSizeInBits(); 9007 if (NumBits1 <= NumBits2) 9008 return false; 9009 return true; 9010} 9011 9012bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 9013 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 9014 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 9015} 9016 9017bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 9018 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 9019 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 9020} 9021 9022bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 9023 // i16 instructions are longer (0x66 prefix) and potentially slower. 9024 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 9025} 9026 9027/// isShuffleMaskLegal - Targets can use this to indicate that they only 9028/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 9029/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 9030/// are assumed to be legal. 9031bool 9032X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 9033 EVT VT) const { 9034 // Very little shuffling can be done for 64-bit vectors right now. 9035 if (VT.getSizeInBits() == 64) 9036 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 9037 9038 // FIXME: pshufb, blends, shifts. 9039 return (VT.getVectorNumElements() == 2 || 9040 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 9041 isMOVLMask(M, VT) || 9042 isSHUFPMask(M, VT) || 9043 isPSHUFDMask(M, VT) || 9044 isPSHUFHWMask(M, VT) || 9045 isPSHUFLWMask(M, VT) || 9046 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 9047 isUNPCKLMask(M, VT) || 9048 isUNPCKHMask(M, VT) || 9049 isUNPCKL_v_undef_Mask(M, VT) || 9050 isUNPCKH_v_undef_Mask(M, VT)); 9051} 9052 9053bool 9054X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 9055 EVT VT) const { 9056 unsigned NumElts = VT.getVectorNumElements(); 9057 // FIXME: This collection of masks seems suspect. 
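  // Example of the kind of shuffle this hook is asked about (a sketch): the
  // DAG combiner may try to turn a vector AND with a 0/-1 constant into a
  // shuffle with the zero vector, e.g.
  //   and <4 x i32> %v, <i32 -1, i32 0, i32 -1, i32 0>
  //    -> shufflevector %v, zeroinitializer, <0, 4, 2, 6>
  // and only does so when the resulting mask is reported legal here.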
9058 if (NumElts == 2) 9059 return true; 9060 if (NumElts == 4 && VT.getSizeInBits() == 128) { 9061 return (isMOVLMask(Mask, VT) || 9062 isCommutedMOVLMask(Mask, VT, true) || 9063 isSHUFPMask(Mask, VT) || 9064 isCommutedSHUFPMask(Mask, VT)); 9065 } 9066 return false; 9067} 9068 9069//===----------------------------------------------------------------------===// 9070// X86 Scheduler Hooks 9071//===----------------------------------------------------------------------===// 9072 9073// private utility function 9074MachineBasicBlock * 9075X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 9076 MachineBasicBlock *MBB, 9077 unsigned regOpc, 9078 unsigned immOpc, 9079 unsigned LoadOpc, 9080 unsigned CXchgOpc, 9081 unsigned notOpc, 9082 unsigned EAXreg, 9083 TargetRegisterClass *RC, 9084 bool invSrc) const { 9085 // For the atomic bitwise operator, we generate 9086 // thisMBB: 9087 // newMBB: 9088 // ld t1 = [bitinstr.addr] 9089 // op t2 = t1, [bitinstr.val] 9090 // mov EAX = t1 9091 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 9092 // bz newMBB 9093 // fallthrough -->nextMBB 9094 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9095 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9096 MachineFunction::iterator MBBIter = MBB; 9097 ++MBBIter; 9098 9099 /// First build the CFG 9100 MachineFunction *F = MBB->getParent(); 9101 MachineBasicBlock *thisMBB = MBB; 9102 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9103 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9104 F->insert(MBBIter, newMBB); 9105 F->insert(MBBIter, nextMBB); 9106 9107 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9108 nextMBB->splice(nextMBB->begin(), thisMBB, 9109 llvm::next(MachineBasicBlock::iterator(bInstr)), 9110 thisMBB->end()); 9111 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9112 9113 // Update thisMBB to fall through to newMBB 9114 thisMBB->addSuccessor(newMBB); 9115 9116 // newMBB jumps to itself and fall through to nextMBB 9117 newMBB->addSuccessor(nextMBB); 9118 newMBB->addSuccessor(newMBB); 9119 9120 // Insert instructions into newMBB based on incoming instruction 9121 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 9122 "unexpected number of operands"); 9123 DebugLoc dl = bInstr->getDebugLoc(); 9124 MachineOperand& destOper = bInstr->getOperand(0); 9125 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9126 int numArgs = bInstr->getNumOperands() - 1; 9127 for (int i=0; i < numArgs; ++i) 9128 argOpers[i] = &bInstr->getOperand(i+1); 9129 9130 // x86 address has 4 operands: base, index, scale, and displacement 9131 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9132 int valArgIndx = lastAddrIndx + 1; 9133 9134 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 9135 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 9136 for (int i=0; i <= lastAddrIndx; ++i) 9137 (*MIB).addOperand(*argOpers[i]); 9138 9139 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 9140 if (invSrc) { 9141 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 9142 } 9143 else 9144 tt = t1; 9145 9146 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 9147 assert((argOpers[valArgIndx]->isReg() || 9148 argOpers[valArgIndx]->isImm()) && 9149 "invalid operand"); 9150 if (argOpers[valArgIndx]->isReg()) 9151 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 9152 else 9153 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 9154 MIB.addReg(tt); 9155 
(*MIB).addOperand(*argOpers[valArgIndx]); 9156 9157 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 9158 MIB.addReg(t1); 9159 9160 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 9161 for (int i=0; i <= lastAddrIndx; ++i) 9162 (*MIB).addOperand(*argOpers[i]); 9163 MIB.addReg(t2); 9164 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9165 (*MIB).setMemRefs(bInstr->memoperands_begin(), 9166 bInstr->memoperands_end()); 9167 9168 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 9169 MIB.addReg(EAXreg); 9170 9171 // insert branch 9172 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9173 9174 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 9175 return nextMBB; 9176} 9177 9178// private utility function: 64 bit atomics on 32 bit host. 9179MachineBasicBlock * 9180X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 9181 MachineBasicBlock *MBB, 9182 unsigned regOpcL, 9183 unsigned regOpcH, 9184 unsigned immOpcL, 9185 unsigned immOpcH, 9186 bool invSrc) const { 9187 // For the atomic bitwise operator, we generate 9188 // thisMBB (instructions are in pairs, except cmpxchg8b) 9189 // ld t1,t2 = [bitinstr.addr] 9190 // newMBB: 9191 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 9192 // op t5, t6 <- out1, out2, [bitinstr.val] 9193 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 9194 // mov ECX, EBX <- t5, t6 9195 // mov EAX, EDX <- t1, t2 9196 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 9197 // mov t3, t4 <- EAX, EDX 9198 // bz newMBB 9199 // result in out1, out2 9200 // fallthrough -->nextMBB 9201 9202 const TargetRegisterClass *RC = X86::GR32RegisterClass; 9203 const unsigned LoadOpc = X86::MOV32rm; 9204 const unsigned NotOpc = X86::NOT32r; 9205 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9206 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9207 MachineFunction::iterator MBBIter = MBB; 9208 ++MBBIter; 9209 9210 /// First build the CFG 9211 MachineFunction *F = MBB->getParent(); 9212 MachineBasicBlock *thisMBB = MBB; 9213 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9214 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9215 F->insert(MBBIter, newMBB); 9216 F->insert(MBBIter, nextMBB); 9217 9218 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9219 nextMBB->splice(nextMBB->begin(), thisMBB, 9220 llvm::next(MachineBasicBlock::iterator(bInstr)), 9221 thisMBB->end()); 9222 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9223 9224 // Update thisMBB to fall through to newMBB 9225 thisMBB->addSuccessor(newMBB); 9226 9227 // newMBB jumps to itself and fall through to nextMBB 9228 newMBB->addSuccessor(nextMBB); 9229 newMBB->addSuccessor(newMBB); 9230 9231 DebugLoc dl = bInstr->getDebugLoc(); 9232 // Insert instructions into newMBB based on incoming instruction 9233 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 9234 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 9235 "unexpected number of operands"); 9236 MachineOperand& dest1Oper = bInstr->getOperand(0); 9237 MachineOperand& dest2Oper = bInstr->getOperand(1); 9238 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9239 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 9240 argOpers[i] = &bInstr->getOperand(i+2); 9241 9242 // We use some of the operands multiple times, so conservatively just 9243 // clear any kill flags that might be present. 
9244 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 9245 argOpers[i]->setIsKill(false); 9246 } 9247 9248 // x86 address has 5 operands: base, index, scale, displacement, and segment. 9249 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9250 9251 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 9252 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 9253 for (int i=0; i <= lastAddrIndx; ++i) 9254 (*MIB).addOperand(*argOpers[i]); 9255 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 9256 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 9257 // add 4 to displacement. 9258 for (int i=0; i <= lastAddrIndx-2; ++i) 9259 (*MIB).addOperand(*argOpers[i]); 9260 MachineOperand newOp3 = *(argOpers[3]); 9261 if (newOp3.isImm()) 9262 newOp3.setImm(newOp3.getImm()+4); 9263 else 9264 newOp3.setOffset(newOp3.getOffset()+4); 9265 (*MIB).addOperand(newOp3); 9266 (*MIB).addOperand(*argOpers[lastAddrIndx]); 9267 9268 // t3/4 are defined later, at the bottom of the loop 9269 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 9270 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 9271 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 9272 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 9273 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 9274 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 9275 9276 // The subsequent operations should be using the destination registers of 9277 //the PHI instructions. 9278 if (invSrc) { 9279 t1 = F->getRegInfo().createVirtualRegister(RC); 9280 t2 = F->getRegInfo().createVirtualRegister(RC); 9281 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 9282 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 9283 } else { 9284 t1 = dest1Oper.getReg(); 9285 t2 = dest2Oper.getReg(); 9286 } 9287 9288 int valArgIndx = lastAddrIndx + 1; 9289 assert((argOpers[valArgIndx]->isReg() || 9290 argOpers[valArgIndx]->isImm()) && 9291 "invalid operand"); 9292 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 9293 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 9294 if (argOpers[valArgIndx]->isReg()) 9295 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 9296 else 9297 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 9298 if (regOpcL != X86::MOV32rr) 9299 MIB.addReg(t1); 9300 (*MIB).addOperand(*argOpers[valArgIndx]); 9301 assert(argOpers[valArgIndx + 1]->isReg() == 9302 argOpers[valArgIndx]->isReg()); 9303 assert(argOpers[valArgIndx + 1]->isImm() == 9304 argOpers[valArgIndx]->isImm()); 9305 if (argOpers[valArgIndx + 1]->isReg()) 9306 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 9307 else 9308 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 9309 if (regOpcH != X86::MOV32rr) 9310 MIB.addReg(t2); 9311 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 9312 9313 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 9314 MIB.addReg(t1); 9315 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 9316 MIB.addReg(t2); 9317 9318 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 9319 MIB.addReg(t5); 9320 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 9321 MIB.addReg(t6); 9322 9323 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 9324 for (int i=0; i <= lastAddrIndx; ++i) 9325 (*MIB).addOperand(*argOpers[i]); 9326 9327 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9328 (*MIB).setMemRefs(bInstr->memoperands_begin(), 9329 bInstr->memoperands_end()); 9330 9331 
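// CMPXCHG8B compares EDX:EAX against the 8-byte memory operand; on failure
// it loads the current memory value into EDX:EAX instead of storing
// ECX:EBX. Copying EAX/EDX into t3/t4 below feeds the PHIs at the top of
// newMBB, so a failed exchange retries with the freshly observed value.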
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 9332 MIB.addReg(X86::EAX); 9333 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 9334 MIB.addReg(X86::EDX); 9335 9336 // insert branch 9337 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9338 9339 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 9340 return nextMBB; 9341} 9342 9343// private utility function 9344MachineBasicBlock * 9345X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 9346 MachineBasicBlock *MBB, 9347 unsigned cmovOpc) const { 9348 // For the atomic min/max operator, we generate 9349 // thisMBB: 9350 // newMBB: 9351 // ld t1 = [min/max.addr] 9352 // mov t2 = [min/max.val] 9353 // cmp t1, t2 9354 // cmov[cond] t2 = t1 9355 // mov EAX = t1 9356 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 9357 // bz newMBB 9358 // fallthrough -->nextMBB 9359 // 9360 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9361 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9362 MachineFunction::iterator MBBIter = MBB; 9363 ++MBBIter; 9364 9365 /// First build the CFG 9366 MachineFunction *F = MBB->getParent(); 9367 MachineBasicBlock *thisMBB = MBB; 9368 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9369 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9370 F->insert(MBBIter, newMBB); 9371 F->insert(MBBIter, nextMBB); 9372 9373 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9374 nextMBB->splice(nextMBB->begin(), thisMBB, 9375 llvm::next(MachineBasicBlock::iterator(mInstr)), 9376 thisMBB->end()); 9377 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9378 9379 // Update thisMBB to fall through to newMBB 9380 thisMBB->addSuccessor(newMBB); 9381 9382 // newMBB jumps to newMBB and fall through to nextMBB 9383 newMBB->addSuccessor(nextMBB); 9384 newMBB->addSuccessor(newMBB); 9385 9386 DebugLoc dl = mInstr->getDebugLoc(); 9387 // Insert instructions into newMBB based on incoming instruction 9388 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 9389 "unexpected number of operands"); 9390 MachineOperand& destOper = mInstr->getOperand(0); 9391 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9392 int numArgs = mInstr->getNumOperands() - 1; 9393 for (int i=0; i < numArgs; ++i) 9394 argOpers[i] = &mInstr->getOperand(i+1); 9395 9396 // x86 address has 4 operands: base, index, scale, and displacement 9397 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9398 int valArgIndx = lastAddrIndx + 1; 9399 9400 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9401 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 9402 for (int i=0; i <= lastAddrIndx; ++i) 9403 (*MIB).addOperand(*argOpers[i]); 9404 9405 // We only support register and immediate values 9406 assert((argOpers[valArgIndx]->isReg() || 9407 argOpers[valArgIndx]->isImm()) && 9408 "invalid operand"); 9409 9410 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9411 if (argOpers[valArgIndx]->isReg()) 9412 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 9413 else 9414 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 9415 (*MIB).addOperand(*argOpers[valArgIndx]); 9416 9417 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 9418 MIB.addReg(t1); 9419 9420 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 9421 MIB.addReg(t1); 9422 MIB.addReg(t2); 9423 9424 // Generate movc 9425 unsigned t3 = 
F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9426 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 9427 MIB.addReg(t2); 9428 MIB.addReg(t1); 9429 9430 // Cmp and exchange if none has modified the memory location 9431 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 9432 for (int i=0; i <= lastAddrIndx; ++i) 9433 (*MIB).addOperand(*argOpers[i]); 9434 MIB.addReg(t3); 9435 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9436 (*MIB).setMemRefs(mInstr->memoperands_begin(), 9437 mInstr->memoperands_end()); 9438 9439 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 9440 MIB.addReg(X86::EAX); 9441 9442 // insert branch 9443 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9444 9445 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 9446 return nextMBB; 9447} 9448 9449// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 9450// or XMM0_V32I8 in AVX all of this code can be replaced with that 9451// in the .td file. 9452MachineBasicBlock * 9453X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 9454 unsigned numArgs, bool memArg) const { 9455 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 9456 "Target must have SSE4.2 or AVX features enabled"); 9457 9458 DebugLoc dl = MI->getDebugLoc(); 9459 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9460 unsigned Opc; 9461 if (!Subtarget->hasAVX()) { 9462 if (memArg) 9463 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 9464 else 9465 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 9466 } else { 9467 if (memArg) 9468 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 9469 else 9470 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 9471 } 9472 9473 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 9474 for (unsigned i = 0; i < numArgs; ++i) { 9475 MachineOperand &Op = MI->getOperand(i+1); 9476 if (!(Op.isReg() && Op.isImplicit())) 9477 MIB.addOperand(Op); 9478 } 9479 BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 9480 .addReg(X86::XMM0); 9481 9482 MI->eraseFromParent(); 9483 return BB; 9484} 9485 9486MachineBasicBlock * 9487X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { 9488 DebugLoc dl = MI->getDebugLoc(); 9489 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9490 9491 // Address into RAX/EAX, other two args into ECX, EDX. 9492 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 9493 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 9494 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 9495 for (int i = 0; i < X86::AddrNumOperands; ++i) 9496 MIB.addOperand(MI->getOperand(i)); 9497 9498 unsigned ValOps = X86::AddrNumOperands; 9499 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 9500 .addReg(MI->getOperand(ValOps).getReg()); 9501 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 9502 .addReg(MI->getOperand(ValOps+1).getReg()); 9503 9504 // The instruction doesn't actually take any operands though. 9505 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 9506 9507 MI->eraseFromParent(); // The pseudo is gone now. 
9508 return BB; 9509} 9510 9511MachineBasicBlock * 9512X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { 9513 DebugLoc dl = MI->getDebugLoc(); 9514 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9515 9516 // First arg in ECX, the second in EAX. 9517 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 9518 .addReg(MI->getOperand(0).getReg()); 9519 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) 9520 .addReg(MI->getOperand(1).getReg()); 9521 9522 // The instruction doesn't actually take any operands though. 9523 BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); 9524 9525 MI->eraseFromParent(); // The pseudo is gone now. 9526 return BB; 9527} 9528 9529MachineBasicBlock * 9530X86TargetLowering::EmitVAARG64WithCustomInserter( 9531 MachineInstr *MI, 9532 MachineBasicBlock *MBB) const { 9533 // Emit va_arg instruction on X86-64. 9534 9535 // Operands to this pseudo-instruction: 9536 // 0 ) Output : destination address (reg) 9537 // 1-5) Input : va_list address (addr, i64mem) 9538 // 6 ) ArgSize : Size (in bytes) of vararg type 9539 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 9540 // 8 ) Align : Alignment of type 9541 // 9 ) EFLAGS (implicit-def) 9542 9543 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 9544 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 9545 9546 unsigned DestReg = MI->getOperand(0).getReg(); 9547 MachineOperand &Base = MI->getOperand(1); 9548 MachineOperand &Scale = MI->getOperand(2); 9549 MachineOperand &Index = MI->getOperand(3); 9550 MachineOperand &Disp = MI->getOperand(4); 9551 MachineOperand &Segment = MI->getOperand(5); 9552 unsigned ArgSize = MI->getOperand(6).getImm(); 9553 unsigned ArgMode = MI->getOperand(7).getImm(); 9554 unsigned Align = MI->getOperand(8).getImm(); 9555 9556 // Memory Reference 9557 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 9558 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 9559 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 9560 9561 // Machine Information 9562 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9563 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 9564 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 9565 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 9566 DebugLoc DL = MI->getDebugLoc(); 9567 9568 // struct va_list { 9569 // i32 gp_offset 9570 // i32 fp_offset 9571 // i64 overflow_area (address) 9572 // i64 reg_save_area (address) 9573 // } 9574 // sizeof(va_list) = 24 9575 // alignment(va_list) = 8 9576 9577 unsigned TotalNumIntRegs = 6; 9578 unsigned TotalNumXMMRegs = 8; 9579 bool UseGPOffset = (ArgMode == 1); 9580 bool UseFPOffset = (ArgMode == 2); 9581 unsigned MaxOffset = TotalNumIntRegs * 8 + 9582 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 9583 9584 /* Align ArgSize to a multiple of 8 */ 9585 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 9586 bool NeedsAlign = (Align > 8); 9587 9588 MachineBasicBlock *thisMBB = MBB; 9589 MachineBasicBlock *overflowMBB; 9590 MachineBasicBlock *offsetMBB; 9591 MachineBasicBlock *endMBB; 9592 9593 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 9594 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 9595 unsigned OffsetReg = 0; 9596 9597 if (!UseGPOffset && !UseFPOffset) { 9598 // If we only pull from the overflow region, we don't create a branch. 
9599 // We don't need to alter control flow. 9600 OffsetDestReg = 0; // unused 9601 OverflowDestReg = DestReg; 9602 9603 offsetMBB = NULL; 9604 overflowMBB = thisMBB; 9605 endMBB = thisMBB; 9606 } else { 9607 // First emit code to check if gp_offset (or fp_offset) is below the bound. 9608 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 9609 // If not, pull from overflow_area. (branch to overflowMBB) 9610 // 9611 // thisMBB 9612 // | . 9613 // | . 9614 // offsetMBB overflowMBB 9615 // | . 9616 // | . 9617 // endMBB 9618 9619 // Registers for the PHI in endMBB 9620 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 9621 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 9622 9623 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9624 MachineFunction *MF = MBB->getParent(); 9625 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9626 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9627 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9628 9629 MachineFunction::iterator MBBIter = MBB; 9630 ++MBBIter; 9631 9632 // Insert the new basic blocks 9633 MF->insert(MBBIter, offsetMBB); 9634 MF->insert(MBBIter, overflowMBB); 9635 MF->insert(MBBIter, endMBB); 9636 9637 // Transfer the remainder of MBB and its successor edges to endMBB. 9638 endMBB->splice(endMBB->begin(), thisMBB, 9639 llvm::next(MachineBasicBlock::iterator(MI)), 9640 thisMBB->end()); 9641 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9642 9643 // Make offsetMBB and overflowMBB successors of thisMBB 9644 thisMBB->addSuccessor(offsetMBB); 9645 thisMBB->addSuccessor(overflowMBB); 9646 9647 // endMBB is a successor of both offsetMBB and overflowMBB 9648 offsetMBB->addSuccessor(endMBB); 9649 overflowMBB->addSuccessor(endMBB); 9650 9651 // Load the offset value into a register 9652 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 9653 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 9654 .addOperand(Base) 9655 .addOperand(Scale) 9656 .addOperand(Index) 9657 .addDisp(Disp, UseFPOffset ? 4 : 0) 9658 .addOperand(Segment) 9659 .setMemRefs(MMOBegin, MMOEnd); 9660 9661 // Check if there is enough room left to pull this argument. 9662 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 9663 .addReg(OffsetReg) 9664 .addImm(MaxOffset + 8 - ArgSizeA8); 9665 9666 // Branch to "overflowMBB" if offset >= max 9667 // Fall through to "offsetMBB" otherwise 9668 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 9669 .addMBB(overflowMBB); 9670 } 9671 9672 // In offsetMBB, emit code to use the reg_save_area. 9673 if (offsetMBB) { 9674 assert(OffsetReg != 0); 9675 9676 // Read the reg_save_area address. 9677 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 9678 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 9679 .addOperand(Base) 9680 .addOperand(Scale) 9681 .addOperand(Index) 9682 .addDisp(Disp, 16) 9683 .addOperand(Segment) 9684 .setMemRefs(MMOBegin, MMOEnd); 9685 9686 // Zero-extend the offset 9687 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 9688 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 9689 .addImm(0) 9690 .addReg(OffsetReg) 9691 .addImm(X86::sub_32bit); 9692 9693 // Add the offset to the reg_save_area to get the final address. 
9694 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 9695 .addReg(OffsetReg64) 9696 .addReg(RegSaveReg); 9697 9698 // Compute the offset for the next argument 9699 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 9700 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 9701 .addReg(OffsetReg) 9702 .addImm(UseFPOffset ? 16 : 8); 9703 9704 // Store it back into the va_list. 9705 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 9706 .addOperand(Base) 9707 .addOperand(Scale) 9708 .addOperand(Index) 9709 .addDisp(Disp, UseFPOffset ? 4 : 0) 9710 .addOperand(Segment) 9711 .addReg(NextOffsetReg) 9712 .setMemRefs(MMOBegin, MMOEnd); 9713 9714 // Jump to endMBB 9715 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 9716 .addMBB(endMBB); 9717 } 9718 9719 // 9720 // Emit code to use overflow area 9721 // 9722 9723 // Load the overflow_area address into a register. 9724 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 9725 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 9726 .addOperand(Base) 9727 .addOperand(Scale) 9728 .addOperand(Index) 9729 .addDisp(Disp, 8) 9730 .addOperand(Segment) 9731 .setMemRefs(MMOBegin, MMOEnd); 9732 9733 // If we need to align it, do so. Otherwise, just copy the address 9734 // to OverflowDestReg. 9735 if (NeedsAlign) { 9736 // Align the overflow address 9737 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 9738 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 9739 9740 // aligned_addr = (addr + (align-1)) & ~(align-1) 9741 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 9742 .addReg(OverflowAddrReg) 9743 .addImm(Align-1); 9744 9745 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 9746 .addReg(TmpReg) 9747 .addImm(~(uint64_t)(Align-1)); 9748 } else { 9749 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 9750 .addReg(OverflowAddrReg); 9751 } 9752 9753 // Compute the next overflow address after this argument. 9754 // (the overflow address should be kept 8-byte aligned) 9755 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 9756 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 9757 .addReg(OverflowDestReg) 9758 .addImm(ArgSizeA8); 9759 9760 // Store the new overflow address. 9761 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 9762 .addOperand(Base) 9763 .addOperand(Scale) 9764 .addOperand(Index) 9765 .addDisp(Disp, 8) 9766 .addOperand(Segment) 9767 .addReg(NextAddrReg) 9768 .setMemRefs(MMOBegin, MMOEnd); 9769 9770 // If we branched, emit the PHI to the front of endMBB. 9771 if (offsetMBB) { 9772 BuildMI(*endMBB, endMBB->begin(), DL, 9773 TII->get(X86::PHI), DestReg) 9774 .addReg(OffsetDestReg).addMBB(offsetMBB) 9775 .addReg(OverflowDestReg).addMBB(overflowMBB); 9776 } 9777 9778 // Erase the pseudo instruction 9779 MI->eraseFromParent(); 9780 9781 return endMBB; 9782} 9783 9784MachineBasicBlock * 9785X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 9786 MachineInstr *MI, 9787 MachineBasicBlock *MBB) const { 9788 // Emit code to save XMM registers to the stack. The ABI says that the 9789 // number of registers to save is given in %al, so it's theoretically 9790 // possible to do an indirect jump trick to avoid saving all of them, 9791 // however this code takes a simpler approach and just executes all 9792 // of the stores if %al is non-zero. It's less code, and it's probably 9793 // easier on the hardware branch predictor, and stores aren't all that 9794 // expensive anyway. 
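// (Note: in the x86-64 SysV convention %al carries an upper bound on the
// number of vector registers used by the varargs call -- e.g. a call such
// as printf("%f", x) is made with %al = 1 -- so the count read below is in
// the range 0..8.)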
9795 9796 // Create the new basic blocks. One block contains all the XMM stores, 9797 // and one block is the final destination regardless of whether any 9798 // stores were performed. 9799 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9800 MachineFunction *F = MBB->getParent(); 9801 MachineFunction::iterator MBBIter = MBB; 9802 ++MBBIter; 9803 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 9804 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 9805 F->insert(MBBIter, XMMSaveMBB); 9806 F->insert(MBBIter, EndMBB); 9807 9808 // Transfer the remainder of MBB and its successor edges to EndMBB. 9809 EndMBB->splice(EndMBB->begin(), MBB, 9810 llvm::next(MachineBasicBlock::iterator(MI)), 9811 MBB->end()); 9812 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 9813 9814 // The original block will now fall through to the XMM save block. 9815 MBB->addSuccessor(XMMSaveMBB); 9816 // The XMMSaveMBB will fall through to the end block. 9817 XMMSaveMBB->addSuccessor(EndMBB); 9818 9819 // Now add the instructions. 9820 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9821 DebugLoc DL = MI->getDebugLoc(); 9822 9823 unsigned CountReg = MI->getOperand(0).getReg(); 9824 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 9825 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 9826 9827 if (!Subtarget->isTargetWin64()) { 9828 // If %al is 0, branch around the XMM save block. 9829 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 9830 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 9831 MBB->addSuccessor(EndMBB); 9832 } 9833 9834 // In the XMM save block, save all the XMM argument registers. 9835 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 9836 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 9837 MachineMemOperand *MMO = 9838 F->getMachineMemOperand( 9839 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 9840 MachineMemOperand::MOStore, 9841 /*Size=*/16, /*Align=*/16); 9842 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 9843 .addFrameIndex(RegSaveFrameIndex) 9844 .addImm(/*Scale=*/1) 9845 .addReg(/*IndexReg=*/0) 9846 .addImm(/*Disp=*/Offset) 9847 .addReg(/*Segment=*/0) 9848 .addReg(MI->getOperand(i).getReg()) 9849 .addMemOperand(MMO); 9850 } 9851 9852 MI->eraseFromParent(); // The pseudo instruction is gone now. 9853 9854 return EndMBB; 9855} 9856 9857MachineBasicBlock * 9858X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 9859 MachineBasicBlock *BB) const { 9860 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9861 DebugLoc DL = MI->getDebugLoc(); 9862 9863 // To "insert" a SELECT_CC instruction, we actually have to insert the 9864 // diamond control-flow pattern. The incoming instruction knows the 9865 // destination vreg to set, the condition code register to branch on, the 9866 // true/false values to select between, and a branch opcode to use. 9867 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9868 MachineFunction::iterator It = BB; 9869 ++It; 9870 9871 // thisMBB: 9872 // ... 9873 // TrueVal = ... 
9874 // cmpTY ccX, r1, r2 9875 // bCC copy1MBB 9876 // fallthrough --> copy0MBB 9877 MachineBasicBlock *thisMBB = BB; 9878 MachineFunction *F = BB->getParent(); 9879 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 9880 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 9881 F->insert(It, copy0MBB); 9882 F->insert(It, sinkMBB); 9883 9884 // If the EFLAGS register isn't dead in the terminator, then claim that it's 9885 // live into the sink and copy blocks. 9886 const MachineFunction *MF = BB->getParent(); 9887 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 9888 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 9889 9890 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 9891 const MachineOperand &MO = MI->getOperand(I); 9892 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 9893 unsigned Reg = MO.getReg(); 9894 if (Reg != X86::EFLAGS) continue; 9895 copy0MBB->addLiveIn(Reg); 9896 sinkMBB->addLiveIn(Reg); 9897 } 9898 9899 // Transfer the remainder of BB and its successor edges to sinkMBB. 9900 sinkMBB->splice(sinkMBB->begin(), BB, 9901 llvm::next(MachineBasicBlock::iterator(MI)), 9902 BB->end()); 9903 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 9904 9905 // Add the true and fallthrough blocks as its successors. 9906 BB->addSuccessor(copy0MBB); 9907 BB->addSuccessor(sinkMBB); 9908 9909 // Create the conditional branch instruction. 9910 unsigned Opc = 9911 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 9912 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 9913 9914 // copy0MBB: 9915 // %FalseValue = ... 9916 // # fallthrough to sinkMBB 9917 copy0MBB->addSuccessor(sinkMBB); 9918 9919 // sinkMBB: 9920 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 9921 // ... 9922 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 9923 TII->get(X86::PHI), MI->getOperand(0).getReg()) 9924 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 9925 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 9926 9927 MI->eraseFromParent(); // The pseudo instruction is gone now. 9928 return sinkMBB; 9929} 9930 9931MachineBasicBlock * 9932X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 9933 MachineBasicBlock *BB) const { 9934 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9935 DebugLoc DL = MI->getDebugLoc(); 9936 9937 // The lowering is pretty easy: we're just emitting the call to _alloca. The 9938 // non-trivial part is impdef of ESP. 9939 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 9940 // mingw-w64. 9941 9942 const char *StackProbeSymbol = 9943 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 9944 9945 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 9946 .addExternalSymbol(StackProbeSymbol) 9947 .addReg(X86::EAX, RegState::Implicit) 9948 .addReg(X86::ESP, RegState::Implicit) 9949 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 9950 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 9951 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 9952 9953 MI->eraseFromParent(); // The pseudo instruction is gone now. 9954 return BB; 9955} 9956 9957MachineBasicBlock * 9958X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 9959 MachineBasicBlock *BB) const { 9960 // This is pretty easy. We're taking the value that we received from 9961 // our load from the relocation, sticking it in either RDI (x86-64) 9962 // or EAX and doing an indirect call. The return value will then 9963 // be in the normal return register. 
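// (Illustrative: on x86-64 Darwin this emits roughly
//     movq  _var@TLVP(%rip), %rdi
//     callq *(%rdi)
// with the variable's address returned in RAX; the 32-bit paths below have
// the same shape using EAX.)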
9964 const X86InstrInfo *TII 9965 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 9966 DebugLoc DL = MI->getDebugLoc(); 9967 MachineFunction *F = BB->getParent(); 9968 9969 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 9970 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 9971 9972 if (Subtarget->is64Bit()) { 9973 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 9974 TII->get(X86::MOV64rm), X86::RDI) 9975 .addReg(X86::RIP) 9976 .addImm(0).addReg(0) 9977 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 9978 MI->getOperand(3).getTargetFlags()) 9979 .addReg(0); 9980 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 9981 addDirectMem(MIB, X86::RDI); 9982 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 9983 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 9984 TII->get(X86::MOV32rm), X86::EAX) 9985 .addReg(0) 9986 .addImm(0).addReg(0) 9987 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 9988 MI->getOperand(3).getTargetFlags()) 9989 .addReg(0); 9990 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 9991 addDirectMem(MIB, X86::EAX); 9992 } else { 9993 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 9994 TII->get(X86::MOV32rm), X86::EAX) 9995 .addReg(TII->getGlobalBaseReg(F)) 9996 .addImm(0).addReg(0) 9997 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 9998 MI->getOperand(3).getTargetFlags()) 9999 .addReg(0); 10000 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 10001 addDirectMem(MIB, X86::EAX); 10002 } 10003 10004 MI->eraseFromParent(); // The pseudo instruction is gone now. 10005 return BB; 10006} 10007 10008MachineBasicBlock * 10009X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 10010 MachineBasicBlock *BB) const { 10011 switch (MI->getOpcode()) { 10012 default: assert(false && "Unexpected instr type to insert"); 10013 case X86::WIN_ALLOCA: 10014 return EmitLoweredWinAlloca(MI, BB); 10015 case X86::TLSCall_32: 10016 case X86::TLSCall_64: 10017 return EmitLoweredTLSCall(MI, BB); 10018 case X86::CMOV_GR8: 10019 case X86::CMOV_FR32: 10020 case X86::CMOV_FR64: 10021 case X86::CMOV_V4F32: 10022 case X86::CMOV_V2F64: 10023 case X86::CMOV_V2I64: 10024 case X86::CMOV_GR16: 10025 case X86::CMOV_GR32: 10026 case X86::CMOV_RFP32: 10027 case X86::CMOV_RFP64: 10028 case X86::CMOV_RFP80: 10029 return EmitLoweredSelect(MI, BB); 10030 10031 case X86::FP32_TO_INT16_IN_MEM: 10032 case X86::FP32_TO_INT32_IN_MEM: 10033 case X86::FP32_TO_INT64_IN_MEM: 10034 case X86::FP64_TO_INT16_IN_MEM: 10035 case X86::FP64_TO_INT32_IN_MEM: 10036 case X86::FP64_TO_INT64_IN_MEM: 10037 case X86::FP80_TO_INT16_IN_MEM: 10038 case X86::FP80_TO_INT32_IN_MEM: 10039 case X86::FP80_TO_INT64_IN_MEM: { 10040 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10041 DebugLoc DL = MI->getDebugLoc(); 10042 10043 // Change the floating point control register to use "round towards zero" 10044 // mode when truncating to an integer value. 10045 MachineFunction *F = BB->getParent(); 10046 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 10047 addFrameReference(BuildMI(*BB, MI, DL, 10048 TII->get(X86::FNSTCW16m)), CWFrameIdx); 10049 10050 // Load the old value of the high byte of the control word... 10051 unsigned OldCW = 10052 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 10053 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 10054 CWFrameIdx); 10055 10056 // Set the high part to be round to zero... 
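// (Bits 11:10 of the x87 control word are the rounding control; the value
// 0xC7F written below sets them to 11b, i.e. round toward zero, while the
// low byte 0x7F leaves all exception masks set.)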
10057 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 10058 .addImm(0xC7F); 10059 10060 // Reload the modified control word now... 10061 addFrameReference(BuildMI(*BB, MI, DL, 10062 TII->get(X86::FLDCW16m)), CWFrameIdx); 10063 10064 // Restore the memory image of control word to original value 10065 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 10066 .addReg(OldCW); 10067 10068 // Get the X86 opcode to use. 10069 unsigned Opc; 10070 switch (MI->getOpcode()) { 10071 default: llvm_unreachable("illegal opcode!"); 10072 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 10073 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 10074 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 10075 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 10076 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 10077 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 10078 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 10079 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 10080 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 10081 } 10082 10083 X86AddressMode AM; 10084 MachineOperand &Op = MI->getOperand(0); 10085 if (Op.isReg()) { 10086 AM.BaseType = X86AddressMode::RegBase; 10087 AM.Base.Reg = Op.getReg(); 10088 } else { 10089 AM.BaseType = X86AddressMode::FrameIndexBase; 10090 AM.Base.FrameIndex = Op.getIndex(); 10091 } 10092 Op = MI->getOperand(1); 10093 if (Op.isImm()) 10094 AM.Scale = Op.getImm(); 10095 Op = MI->getOperand(2); 10096 if (Op.isImm()) 10097 AM.IndexReg = Op.getImm(); 10098 Op = MI->getOperand(3); 10099 if (Op.isGlobal()) { 10100 AM.GV = Op.getGlobal(); 10101 } else { 10102 AM.Disp = Op.getImm(); 10103 } 10104 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 10105 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 10106 10107 // Reload the original control word now. 10108 addFrameReference(BuildMI(*BB, MI, DL, 10109 TII->get(X86::FLDCW16m)), CWFrameIdx); 10110 10111 MI->eraseFromParent(); // The pseudo instruction is gone now. 10112 return BB; 10113 } 10114 // String/text processing lowering. 10115 case X86::PCMPISTRM128REG: 10116 case X86::VPCMPISTRM128REG: 10117 return EmitPCMP(MI, BB, 3, false /* in-mem */); 10118 case X86::PCMPISTRM128MEM: 10119 case X86::VPCMPISTRM128MEM: 10120 return EmitPCMP(MI, BB, 3, true /* in-mem */); 10121 case X86::PCMPESTRM128REG: 10122 case X86::VPCMPESTRM128REG: 10123 return EmitPCMP(MI, BB, 5, false /* in mem */); 10124 case X86::PCMPESTRM128MEM: 10125 case X86::VPCMPESTRM128MEM: 10126 return EmitPCMP(MI, BB, 5, true /* in mem */); 10127 10128 // Thread synchronization. 10129 case X86::MONITOR: 10130 return EmitMonitor(MI, BB); 10131 case X86::MWAIT: 10132 return EmitMwait(MI, BB); 10133 10134 // Atomic Lowering. 
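// (For example, an i32 atomic "load-and" -- ISD::ATOMIC_LOAD_AND -- selects
// to the ATOMAND32 pseudo, which is expanded here into the load / op /
// LCMPXCHG retry loop built by the custom inserters above.)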
10135 case X86::ATOMAND32: 10136 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10137 X86::AND32ri, X86::MOV32rm, 10138 X86::LCMPXCHG32, 10139 X86::NOT32r, X86::EAX, 10140 X86::GR32RegisterClass); 10141 case X86::ATOMOR32: 10142 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 10143 X86::OR32ri, X86::MOV32rm, 10144 X86::LCMPXCHG32, 10145 X86::NOT32r, X86::EAX, 10146 X86::GR32RegisterClass); 10147 case X86::ATOMXOR32: 10148 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 10149 X86::XOR32ri, X86::MOV32rm, 10150 X86::LCMPXCHG32, 10151 X86::NOT32r, X86::EAX, 10152 X86::GR32RegisterClass); 10153 case X86::ATOMNAND32: 10154 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10155 X86::AND32ri, X86::MOV32rm, 10156 X86::LCMPXCHG32, 10157 X86::NOT32r, X86::EAX, 10158 X86::GR32RegisterClass, true); 10159 case X86::ATOMMIN32: 10160 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 10161 case X86::ATOMMAX32: 10162 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 10163 case X86::ATOMUMIN32: 10164 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 10165 case X86::ATOMUMAX32: 10166 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 10167 10168 case X86::ATOMAND16: 10169 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10170 X86::AND16ri, X86::MOV16rm, 10171 X86::LCMPXCHG16, 10172 X86::NOT16r, X86::AX, 10173 X86::GR16RegisterClass); 10174 case X86::ATOMOR16: 10175 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 10176 X86::OR16ri, X86::MOV16rm, 10177 X86::LCMPXCHG16, 10178 X86::NOT16r, X86::AX, 10179 X86::GR16RegisterClass); 10180 case X86::ATOMXOR16: 10181 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 10182 X86::XOR16ri, X86::MOV16rm, 10183 X86::LCMPXCHG16, 10184 X86::NOT16r, X86::AX, 10185 X86::GR16RegisterClass); 10186 case X86::ATOMNAND16: 10187 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10188 X86::AND16ri, X86::MOV16rm, 10189 X86::LCMPXCHG16, 10190 X86::NOT16r, X86::AX, 10191 X86::GR16RegisterClass, true); 10192 case X86::ATOMMIN16: 10193 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 10194 case X86::ATOMMAX16: 10195 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 10196 case X86::ATOMUMIN16: 10197 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 10198 case X86::ATOMUMAX16: 10199 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 10200 10201 case X86::ATOMAND8: 10202 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10203 X86::AND8ri, X86::MOV8rm, 10204 X86::LCMPXCHG8, 10205 X86::NOT8r, X86::AL, 10206 X86::GR8RegisterClass); 10207 case X86::ATOMOR8: 10208 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 10209 X86::OR8ri, X86::MOV8rm, 10210 X86::LCMPXCHG8, 10211 X86::NOT8r, X86::AL, 10212 X86::GR8RegisterClass); 10213 case X86::ATOMXOR8: 10214 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 10215 X86::XOR8ri, X86::MOV8rm, 10216 X86::LCMPXCHG8, 10217 X86::NOT8r, X86::AL, 10218 X86::GR8RegisterClass); 10219 case X86::ATOMNAND8: 10220 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10221 X86::AND8ri, X86::MOV8rm, 10222 X86::LCMPXCHG8, 10223 X86::NOT8r, X86::AL, 10224 X86::GR8RegisterClass, true); 10225 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 10226 // This group is for 64-bit host. 
10227 case X86::ATOMAND64: 10228 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10229 X86::AND64ri32, X86::MOV64rm, 10230 X86::LCMPXCHG64, 10231 X86::NOT64r, X86::RAX, 10232 X86::GR64RegisterClass); 10233 case X86::ATOMOR64: 10234 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 10235 X86::OR64ri32, X86::MOV64rm, 10236 X86::LCMPXCHG64, 10237 X86::NOT64r, X86::RAX, 10238 X86::GR64RegisterClass); 10239 case X86::ATOMXOR64: 10240 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 10241 X86::XOR64ri32, X86::MOV64rm, 10242 X86::LCMPXCHG64, 10243 X86::NOT64r, X86::RAX, 10244 X86::GR64RegisterClass); 10245 case X86::ATOMNAND64: 10246 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10247 X86::AND64ri32, X86::MOV64rm, 10248 X86::LCMPXCHG64, 10249 X86::NOT64r, X86::RAX, 10250 X86::GR64RegisterClass, true); 10251 case X86::ATOMMIN64: 10252 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 10253 case X86::ATOMMAX64: 10254 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 10255 case X86::ATOMUMIN64: 10256 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 10257 case X86::ATOMUMAX64: 10258 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 10259 10260 // This group does 64-bit operations on a 32-bit host. 10261 case X86::ATOMAND6432: 10262 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10263 X86::AND32rr, X86::AND32rr, 10264 X86::AND32ri, X86::AND32ri, 10265 false); 10266 case X86::ATOMOR6432: 10267 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10268 X86::OR32rr, X86::OR32rr, 10269 X86::OR32ri, X86::OR32ri, 10270 false); 10271 case X86::ATOMXOR6432: 10272 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10273 X86::XOR32rr, X86::XOR32rr, 10274 X86::XOR32ri, X86::XOR32ri, 10275 false); 10276 case X86::ATOMNAND6432: 10277 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10278 X86::AND32rr, X86::AND32rr, 10279 X86::AND32ri, X86::AND32ri, 10280 true); 10281 case X86::ATOMADD6432: 10282 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10283 X86::ADD32rr, X86::ADC32rr, 10284 X86::ADD32ri, X86::ADC32ri, 10285 false); 10286 case X86::ATOMSUB6432: 10287 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10288 X86::SUB32rr, X86::SBB32rr, 10289 X86::SUB32ri, X86::SBB32ri, 10290 false); 10291 case X86::ATOMSWAP6432: 10292 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10293 X86::MOV32rr, X86::MOV32rr, 10294 X86::MOV32ri, X86::MOV32ri, 10295 false); 10296 case X86::VASTART_SAVE_XMM_REGS: 10297 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 10298 10299 case X86::VAARG_64: 10300 return EmitVAARG64WithCustomInserter(MI, BB); 10301 } 10302} 10303 10304//===----------------------------------------------------------------------===// 10305// X86 Optimization Hooks 10306//===----------------------------------------------------------------------===// 10307 10308void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 10309 const APInt &Mask, 10310 APInt &KnownZero, 10311 APInt &KnownOne, 10312 const SelectionDAG &DAG, 10313 unsigned Depth) const { 10314 unsigned Opc = Op.getOpcode(); 10315 assert((Opc >= ISD::BUILTIN_OP_END || 10316 Opc == ISD::INTRINSIC_WO_CHAIN || 10317 Opc == ISD::INTRINSIC_W_CHAIN || 10318 Opc == ISD::INTRINSIC_VOID) && 10319 "Should use MaskedValueIsZero if you don't know whether Op" 10320 " is a target node!"); 10321 10322 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
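// (For the flag-producing arithmetic nodes handled below, only result 1 --
// the boolean derived from EFLAGS -- is of interest: everything above bit 0
// of that result is reported as known zero, the same as for SETCC.)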
10323 switch (Opc) { 10324 default: break; 10325 case X86ISD::ADD: 10326 case X86ISD::SUB: 10327 case X86ISD::SMUL: 10328 case X86ISD::UMUL: 10329 case X86ISD::INC: 10330 case X86ISD::DEC: 10331 case X86ISD::OR: 10332 case X86ISD::XOR: 10333 case X86ISD::AND: 10334 // These nodes' second result is a boolean. 10335 if (Op.getResNo() == 0) 10336 break; 10337 // Fallthrough 10338 case X86ISD::SETCC: 10339 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 10340 Mask.getBitWidth() - 1); 10341 break; 10342 } 10343} 10344 10345unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 10346 unsigned Depth) const { 10347 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 10348 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 10349 return Op.getValueType().getScalarType().getSizeInBits(); 10350 10351 // Fallback case. 10352 return 1; 10353} 10354 10355/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 10356/// node is a GlobalAddress + offset. 10357bool X86TargetLowering::isGAPlusOffset(SDNode *N, 10358 const GlobalValue* &GA, 10359 int64_t &Offset) const { 10360 if (N->getOpcode() == X86ISD::Wrapper) { 10361 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 10362 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 10363 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 10364 return true; 10365 } 10366 } 10367 return TargetLowering::isGAPlusOffset(N, GA, Offset); 10368} 10369 10370/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 10371/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 10372/// if the load addresses are consecutive, non-overlapping, and in the right 10373/// order. 10374static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 10375 TargetLowering::DAGCombinerInfo &DCI) { 10376 DebugLoc dl = N->getDebugLoc(); 10377 EVT VT = N->getValueType(0); 10378 10379 if (VT.getSizeInBits() != 128) 10380 return SDValue(); 10381 10382 // Don't create instructions with illegal types after legalize types has run. 10383 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10384 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 10385 return SDValue(); 10386 10387 SmallVector<SDValue, 16> Elts; 10388 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 10389 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 10390 10391 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 10392} 10393 10394/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 10395/// generation and convert it from being a bunch of shuffles and extracts 10396/// to a simple store and scalar loads to extract the elements. 10397static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 10398 const TargetLowering &TLI) { 10399 SDValue InputVector = N->getOperand(0); 10400 10401 // Only operate on vectors of 4 elements, where the alternative shuffling 10402 // gets to be more expensive. 10403 if (InputVector.getValueType() != MVT::v4i32) 10404 return SDValue(); 10405 10406 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 10407 // single use which is a sign-extend or zero-extend, and all elements are 10408 // used. 
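// (Illustrative IR for the pattern being matched, assuming a v4i32 input:
//    %e0 = extractelement <4 x i32> %v, i32 0
//    %z0 = zext i32 %e0 to i64
//    ...and likewise for elements 1-3. The combine below replaces the
// extracts with one store of %v to a stack slot plus four scalar loads.)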
10409 SmallVector<SDNode *, 4> Uses; 10410 unsigned ExtractedElements = 0; 10411 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 10412 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 10413 if (UI.getUse().getResNo() != InputVector.getResNo()) 10414 return SDValue(); 10415 10416 SDNode *Extract = *UI; 10417 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 10418 return SDValue(); 10419 10420 if (Extract->getValueType(0) != MVT::i32) 10421 return SDValue(); 10422 if (!Extract->hasOneUse()) 10423 return SDValue(); 10424 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 10425 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 10426 return SDValue(); 10427 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 10428 return SDValue(); 10429 10430 // Record which element was extracted. 10431 ExtractedElements |= 10432 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 10433 10434 Uses.push_back(Extract); 10435 } 10436 10437 // If not all the elements were used, this may not be worthwhile. 10438 if (ExtractedElements != 15) 10439 return SDValue(); 10440 10441 // Ok, we've now decided to do the transformation. 10442 DebugLoc dl = InputVector.getDebugLoc(); 10443 10444 // Store the value to a temporary stack slot. 10445 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 10446 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 10447 MachinePointerInfo(), false, false, 0); 10448 10449 // Replace each use (extract) with a load of the appropriate element. 10450 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 10451 UE = Uses.end(); UI != UE; ++UI) { 10452 SDNode *Extract = *UI; 10453 10454 // Compute the element's address. 10455 SDValue Idx = Extract->getOperand(1); 10456 unsigned EltSize = 10457 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 10458 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 10459 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 10460 10461 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), 10462 StackPtr, OffsetVal); 10463 10464 // Load the scalar. 10465 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 10466 ScalarAddr, MachinePointerInfo(), 10467 false, false, 0); 10468 10469 // Replace the exact with the load. 10470 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 10471 } 10472 10473 // The replacement was made in place; don't return anything. 10474 return SDValue(); 10475} 10476 10477/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 10478static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 10479 const X86Subtarget *Subtarget) { 10480 DebugLoc DL = N->getDebugLoc(); 10481 SDValue Cond = N->getOperand(0); 10482 // Get the LHS/RHS of the select. 10483 SDValue LHS = N->getOperand(1); 10484 SDValue RHS = N->getOperand(2); 10485 10486 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 10487 // instructions match the semantics of the common C idiom x<y?x:y but not 10488 // x<=y?x:y, because of how they handle negative zero (which can be 10489 // ignored in unsafe-math mode). 10490 if (Subtarget->hasSSE2() && 10491 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 10492 Cond.getOpcode() == ISD::SETCC) { 10493 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 10494 10495 unsigned Opcode = 0; 10496 // Check for x CC y ? x : y. 
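// (e.g. (select (setcc olt x, y), x, y) can become (X86ISD::FMIN x, y),
// i.e. a single MINSS/MINSD, subject to the NaN and signed-zero caveats
// handled per condition code below.)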
10497 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 10498 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 10499 switch (CC) { 10500 default: break; 10501 case ISD::SETULT: 10502 // Converting this to a min would handle NaNs incorrectly, and swapping 10503 // the operands would cause it to handle comparisons between positive 10504 // and negative zero incorrectly. 10505 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 10506 if (!UnsafeFPMath && 10507 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 10508 break; 10509 std::swap(LHS, RHS); 10510 } 10511 Opcode = X86ISD::FMIN; 10512 break; 10513 case ISD::SETOLE: 10514 // Converting this to a min would handle comparisons between positive 10515 // and negative zero incorrectly. 10516 if (!UnsafeFPMath && 10517 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 10518 break; 10519 Opcode = X86ISD::FMIN; 10520 break; 10521 case ISD::SETULE: 10522 // Converting this to a min would handle both negative zeros and NaNs 10523 // incorrectly, but we can swap the operands to fix both. 10524 std::swap(LHS, RHS); 10525 case ISD::SETOLT: 10526 case ISD::SETLT: 10527 case ISD::SETLE: 10528 Opcode = X86ISD::FMIN; 10529 break; 10530 10531 case ISD::SETOGE: 10532 // Converting this to a max would handle comparisons between positive 10533 // and negative zero incorrectly. 10534 if (!UnsafeFPMath && 10535 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(LHS)) 10536 break; 10537 Opcode = X86ISD::FMAX; 10538 break; 10539 case ISD::SETUGT: 10540 // Converting this to a max would handle NaNs incorrectly, and swapping 10541 // the operands would cause it to handle comparisons between positive 10542 // and negative zero incorrectly. 10543 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 10544 if (!UnsafeFPMath && 10545 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 10546 break; 10547 std::swap(LHS, RHS); 10548 } 10549 Opcode = X86ISD::FMAX; 10550 break; 10551 case ISD::SETUGE: 10552 // Converting this to a max would handle both negative zeros and NaNs 10553 // incorrectly, but we can swap the operands to fix both. 10554 std::swap(LHS, RHS); 10555 case ISD::SETOGT: 10556 case ISD::SETGT: 10557 case ISD::SETGE: 10558 Opcode = X86ISD::FMAX; 10559 break; 10560 } 10561 // Check for x CC y ? y : x -- a min/max with reversed arms. 10562 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 10563 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 10564 switch (CC) { 10565 default: break; 10566 case ISD::SETOGE: 10567 // Converting this to a min would handle comparisons between positive 10568 // and negative zero incorrectly, and swapping the operands would 10569 // cause it to handle NaNs incorrectly. 10570 if (!UnsafeFPMath && 10571 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 10572 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10573 break; 10574 std::swap(LHS, RHS); 10575 } 10576 Opcode = X86ISD::FMIN; 10577 break; 10578 case ISD::SETUGT: 10579 // Converting this to a min would handle NaNs incorrectly. 10580 if (!UnsafeFPMath && 10581 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 10582 break; 10583 Opcode = X86ISD::FMIN; 10584 break; 10585 case ISD::SETUGE: 10586 // Converting this to a min would handle both negative zeros and NaNs 10587 // incorrectly, but we can swap the operands to fix both. 
10588 std::swap(LHS, RHS); 10589 case ISD::SETOGT: 10590 case ISD::SETGT: 10591 case ISD::SETGE: 10592 Opcode = X86ISD::FMIN; 10593 break; 10594 10595 case ISD::SETULT: 10596 // Converting this to a max would handle NaNs incorrectly. 10597 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10598 break; 10599 Opcode = X86ISD::FMAX; 10600 break; 10601 case ISD::SETOLE: 10602 // Converting this to a max would handle comparisons between positive 10603 // and negative zero incorrectly, and swapping the operands would 10604 // cause it to handle NaNs incorrectly. 10605 if (!UnsafeFPMath && 10606 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 10607 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10608 break; 10609 std::swap(LHS, RHS); 10610 } 10611 Opcode = X86ISD::FMAX; 10612 break; 10613 case ISD::SETULE: 10614 // Converting this to a max would handle both negative zeros and NaNs 10615 // incorrectly, but we can swap the operands to fix both. 10616 std::swap(LHS, RHS); 10617 case ISD::SETOLT: 10618 case ISD::SETLT: 10619 case ISD::SETLE: 10620 Opcode = X86ISD::FMAX; 10621 break; 10622 } 10623 } 10624 10625 if (Opcode) 10626 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 10627 } 10628 10629 // If this is a select between two integer constants, try to do some 10630 // optimizations. 10631 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 10632 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 10633 // Don't do this for crazy integer types. 10634 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 10635 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 10636 // so that TrueC (the true value) is larger than FalseC. 10637 bool NeedsCondInvert = false; 10638 10639 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 10640 // Efficiently invertible. 10641 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 10642 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 10643 isa<ConstantSDNode>(Cond.getOperand(1))))) { 10644 NeedsCondInvert = true; 10645 std::swap(TrueC, FalseC); 10646 } 10647 10648 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 10649 if (FalseC->getAPIntValue() == 0 && 10650 TrueC->getAPIntValue().isPowerOf2()) { 10651 if (NeedsCondInvert) // Invert the condition if needed. 10652 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10653 DAG.getConstant(1, Cond.getValueType())); 10654 10655 // Zero extend the condition if needed. 10656 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 10657 10658 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 10659 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 10660 DAG.getConstant(ShAmt, MVT::i8)); 10661 } 10662 10663 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 10664 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 10665 if (NeedsCondInvert) // Invert the condition if needed. 10666 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10667 DAG.getConstant(1, Cond.getValueType())); 10668 10669 // Zero extend the condition if needed. 10670 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 10671 FalseC->getValueType(0), Cond); 10672 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10673 SDValue(FalseC, 0)); 10674 } 10675 10676 // Optimize cases that will turn into an LEA instruction. This requires 10677 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
10678 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 10679 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 10680 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 10681 10682 bool isFastMultiplier = false; 10683 if (Diff < 10) { 10684 switch ((unsigned char)Diff) { 10685 default: break; 10686 case 1: // result = add base, cond 10687 case 2: // result = lea base( , cond*2) 10688 case 3: // result = lea base(cond, cond*2) 10689 case 4: // result = lea base( , cond*4) 10690 case 5: // result = lea base(cond, cond*4) 10691 case 8: // result = lea base( , cond*8) 10692 case 9: // result = lea base(cond, cond*8) 10693 isFastMultiplier = true; 10694 break; 10695 } 10696 } 10697 10698 if (isFastMultiplier) { 10699 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 10700 if (NeedsCondInvert) // Invert the condition if needed. 10701 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10702 DAG.getConstant(1, Cond.getValueType())); 10703 10704 // Zero extend the condition if needed. 10705 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 10706 Cond); 10707 // Scale the condition by the difference. 10708 if (Diff != 1) 10709 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 10710 DAG.getConstant(Diff, Cond.getValueType())); 10711 10712 // Add the base if non-zero. 10713 if (FalseC->getAPIntValue() != 0) 10714 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10715 SDValue(FalseC, 0)); 10716 return Cond; 10717 } 10718 } 10719 } 10720 } 10721 10722 return SDValue(); 10723} 10724 10725/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 10726static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 10727 TargetLowering::DAGCombinerInfo &DCI) { 10728 DebugLoc DL = N->getDebugLoc(); 10729 10730 // If the flag operand isn't dead, don't touch this CMOV. 10731 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 10732 return SDValue(); 10733 10734 // If this is a select between two integer constants, try to do some 10735 // optimizations. Note that the operands are ordered the opposite of SELECT 10736 // operands. 10737 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 10738 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 10739 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 10740 // larger than FalseC (the false value). 10741 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 10742 10743 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 10744 CC = X86::GetOppositeBranchCondition(CC); 10745 std::swap(TrueC, FalseC); 10746 } 10747 10748 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 10749 // This is efficient for any integer data type (including i8/i16) and 10750 // shift amount. 10751 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 10752 SDValue Cond = N->getOperand(3); 10753 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10754 DAG.getConstant(CC, MVT::i8), Cond); 10755 10756 // Zero extend the condition if needed. 10757 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 10758 10759 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 10760 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 10761 DAG.getConstant(ShAmt, MVT::i8)); 10762 if (N->getNumValues() == 2) // Dead flag value? 10763 return DCI.CombineTo(N, Cond, SDValue()); 10764 return Cond; 10765 } 10766 10767 // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C)+cst. This is efficient 10768 // for any integer data type, including i8/i16. 10769 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 10770 SDValue Cond = N->getOperand(3); 10771 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10772 DAG.getConstant(CC, MVT::i8), Cond); 10773 10774 // Zero extend the condition if needed. 10775 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 10776 FalseC->getValueType(0), Cond); 10777 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10778 SDValue(FalseC, 0)); 10779 10780 if (N->getNumValues() == 2) // Dead flag value? 10781 return DCI.CombineTo(N, Cond, SDValue()); 10782 return Cond; 10783 } 10784 10785 // Optimize cases that will turn into an LEA instruction. This requires 10786 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 10787 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 10788 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 10789 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 10790 10791 bool isFastMultiplier = false; 10792 if (Diff < 10) { 10793 switch ((unsigned char)Diff) { 10794 default: break; 10795 case 1: // result = add base, cond 10796 case 2: // result = lea base( , cond*2) 10797 case 3: // result = lea base(cond, cond*2) 10798 case 4: // result = lea base( , cond*4) 10799 case 5: // result = lea base(cond, cond*4) 10800 case 8: // result = lea base( , cond*8) 10801 case 9: // result = lea base(cond, cond*8) 10802 isFastMultiplier = true; 10803 break; 10804 } 10805 } 10806 10807 if (isFastMultiplier) { 10808 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 10809 SDValue Cond = N->getOperand(3); 10810 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10811 DAG.getConstant(CC, MVT::i8), Cond); 10812 // Zero extend the condition if needed. 10813 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 10814 Cond); 10815 // Scale the condition by the difference. 10816 if (Diff != 1) 10817 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 10818 DAG.getConstant(Diff, Cond.getValueType())); 10819 10820 // Add the base if non-zero. 10821 if (FalseC->getAPIntValue() != 0) 10822 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10823 SDValue(FalseC, 0)); 10824 if (N->getNumValues() == 2) // Dead flag value? 10825 return DCI.CombineTo(N, Cond, SDValue()); 10826 return Cond; 10827 } 10828 } 10829 } 10830 } 10831 return SDValue(); 10832} 10833 10834 10835/// PerformMulCombine - Optimize a single multiply with constant into two 10836/// in order to implement it with two cheaper instructions, e.g. 10837/// LEA + SHL, LEA + LEA. 
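/// For example, given the checks performed below (an i64 multiply by a
/// constant that is not already a power of two, 3, 5, or 9):
///   x * 45  ->  (x * 9) * 5, i.e. two LEAs
///   x * 40  ->  a *5 (LEA) combined with a <<3 (SHL), ordered so the LEA can
///               fold into the addressing mode when profitable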
10838static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 10839 TargetLowering::DAGCombinerInfo &DCI) { 10840 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 10841 return SDValue(); 10842 10843 EVT VT = N->getValueType(0); 10844 if (VT != MVT::i64) 10845 return SDValue(); 10846 10847 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 10848 if (!C) 10849 return SDValue(); 10850 uint64_t MulAmt = C->getZExtValue(); 10851 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 10852 return SDValue(); 10853 10854 uint64_t MulAmt1 = 0; 10855 uint64_t MulAmt2 = 0; 10856 if ((MulAmt % 9) == 0) { 10857 MulAmt1 = 9; 10858 MulAmt2 = MulAmt / 9; 10859 } else if ((MulAmt % 5) == 0) { 10860 MulAmt1 = 5; 10861 MulAmt2 = MulAmt / 5; 10862 } else if ((MulAmt % 3) == 0) { 10863 MulAmt1 = 3; 10864 MulAmt2 = MulAmt / 3; 10865 } 10866 if (MulAmt2 && 10867 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 10868 DebugLoc DL = N->getDebugLoc(); 10869 10870 if (isPowerOf2_64(MulAmt2) && 10871 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 10872 // If second multiplifer is pow2, issue it first. We want the multiply by 10873 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 10874 // is an add. 10875 std::swap(MulAmt1, MulAmt2); 10876 10877 SDValue NewMul; 10878 if (isPowerOf2_64(MulAmt1)) 10879 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 10880 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 10881 else 10882 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 10883 DAG.getConstant(MulAmt1, VT)); 10884 10885 if (isPowerOf2_64(MulAmt2)) 10886 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 10887 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 10888 else 10889 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 10890 DAG.getConstant(MulAmt2, VT)); 10891 10892 // Do not add new nodes to DAG combiner worklist. 10893 DCI.CombineTo(N, NewMul, false); 10894 } 10895 return SDValue(); 10896} 10897 10898static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 10899 SDValue N0 = N->getOperand(0); 10900 SDValue N1 = N->getOperand(1); 10901 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 10902 EVT VT = N0.getValueType(); 10903 10904 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 10905 // since the result of setcc_c is all zero's or all ones. 10906 if (N1C && N0.getOpcode() == ISD::AND && 10907 N0.getOperand(1).getOpcode() == ISD::Constant) { 10908 SDValue N00 = N0.getOperand(0); 10909 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 10910 ((N00.getOpcode() == ISD::ANY_EXTEND || 10911 N00.getOpcode() == ISD::ZERO_EXTEND) && 10912 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 10913 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 10914 APInt ShAmt = N1C->getAPIntValue(); 10915 Mask = Mask.shl(ShAmt); 10916 if (Mask != 0) 10917 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 10918 N00, DAG.getConstant(Mask, VT)); 10919 } 10920 } 10921 10922 return SDValue(); 10923} 10924 10925/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 10926/// when possible. 
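/// This requires SSE2, handles only v2i64/v4i32/v8i16, and fires only when
/// every lane is shifted by the same amount; the splatted shift amount is
/// then fed to the corresponding psll/psra/psrl intrinsic below.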
10927static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 10928 const X86Subtarget *Subtarget) { 10929 EVT VT = N->getValueType(0); 10930 if (!VT.isVector() && VT.isInteger() && 10931 N->getOpcode() == ISD::SHL) 10932 return PerformSHLCombine(N, DAG); 10933 10934 // On X86 with SSE2 support, we can transform this to a vector shift if 10935 // all elements are shifted by the same amount. We can't do this in legalize 10936 // because the a constant vector is typically transformed to a constant pool 10937 // so we have no knowledge of the shift amount. 10938 if (!Subtarget->hasSSE2()) 10939 return SDValue(); 10940 10941 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 10942 return SDValue(); 10943 10944 SDValue ShAmtOp = N->getOperand(1); 10945 EVT EltVT = VT.getVectorElementType(); 10946 DebugLoc DL = N->getDebugLoc(); 10947 SDValue BaseShAmt = SDValue(); 10948 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 10949 unsigned NumElts = VT.getVectorNumElements(); 10950 unsigned i = 0; 10951 for (; i != NumElts; ++i) { 10952 SDValue Arg = ShAmtOp.getOperand(i); 10953 if (Arg.getOpcode() == ISD::UNDEF) continue; 10954 BaseShAmt = Arg; 10955 break; 10956 } 10957 for (; i != NumElts; ++i) { 10958 SDValue Arg = ShAmtOp.getOperand(i); 10959 if (Arg.getOpcode() == ISD::UNDEF) continue; 10960 if (Arg != BaseShAmt) { 10961 return SDValue(); 10962 } 10963 } 10964 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 10965 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 10966 SDValue InVec = ShAmtOp.getOperand(0); 10967 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 10968 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 10969 unsigned i = 0; 10970 for (; i != NumElts; ++i) { 10971 SDValue Arg = InVec.getOperand(i); 10972 if (Arg.getOpcode() == ISD::UNDEF) continue; 10973 BaseShAmt = Arg; 10974 break; 10975 } 10976 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 10977 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 10978 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 10979 if (C->getZExtValue() == SplatIdx) 10980 BaseShAmt = InVec.getOperand(1); 10981 } 10982 } 10983 if (BaseShAmt.getNode() == 0) 10984 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 10985 DAG.getIntPtrConstant(0)); 10986 } else 10987 return SDValue(); 10988 10989 // The shift amount is an i32. 10990 if (EltVT.bitsGT(MVT::i32)) 10991 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 10992 else if (EltVT.bitsLT(MVT::i32)) 10993 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 10994 10995 // The shift amount is identical so we can do a vector shift. 
10996 SDValue ValOp = N->getOperand(0); 10997 switch (N->getOpcode()) { 10998 default: 10999 llvm_unreachable("Unknown shift opcode!"); 11000 break; 11001 case ISD::SHL: 11002 if (VT == MVT::v2i64) 11003 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11004 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 11005 ValOp, BaseShAmt); 11006 if (VT == MVT::v4i32) 11007 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11008 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 11009 ValOp, BaseShAmt); 11010 if (VT == MVT::v8i16) 11011 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11012 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 11013 ValOp, BaseShAmt); 11014 break; 11015 case ISD::SRA: 11016 if (VT == MVT::v4i32) 11017 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11018 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 11019 ValOp, BaseShAmt); 11020 if (VT == MVT::v8i16) 11021 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11022 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 11023 ValOp, BaseShAmt); 11024 break; 11025 case ISD::SRL: 11026 if (VT == MVT::v2i64) 11027 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11028 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 11029 ValOp, BaseShAmt); 11030 if (VT == MVT::v4i32) 11031 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11032 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 11033 ValOp, BaseShAmt); 11034 if (VT == MVT::v8i16) 11035 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 11036 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 11037 ValOp, BaseShAmt); 11038 break; 11039 } 11040 return SDValue(); 11041} 11042 11043 11044static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 11045 TargetLowering::DAGCombinerInfo &DCI, 11046 const X86Subtarget *Subtarget) { 11047 if (DCI.isBeforeLegalizeOps()) 11048 return SDValue(); 11049 11050 // Want to form PANDN nodes, in the hopes of then easily combining them with 11051 // OR and AND nodes to form PBLEND/PSIGN. 
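// Concretely, for v2i64 only: and(vnot(x), y) and and(y, vnot(x)) are
// rewritten below to PANDN(x, y), i.e. (~x) & y.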
11052 EVT VT = N->getValueType(0); 11053 if (VT != MVT::v2i64) 11054 return SDValue(); 11055 11056 SDValue N0 = N->getOperand(0); 11057 SDValue N1 = N->getOperand(1); 11058 DebugLoc DL = N->getDebugLoc(); 11059 11060 // Check LHS for vnot 11061 if (N0.getOpcode() == ISD::XOR && 11062 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 11063 return DAG.getNode(X86ISD::PANDN, DL, VT, N0.getOperand(0), N1); 11064 11065 // Check RHS for vnot 11066 if (N1.getOpcode() == ISD::XOR && 11067 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 11068 return DAG.getNode(X86ISD::PANDN, DL, VT, N1.getOperand(0), N0); 11069 11070 return SDValue(); 11071} 11072 11073static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 11074 TargetLowering::DAGCombinerInfo &DCI, 11075 const X86Subtarget *Subtarget) { 11076 if (DCI.isBeforeLegalizeOps()) 11077 return SDValue(); 11078 11079 EVT VT = N->getValueType(0); 11080 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) 11081 return SDValue(); 11082 11083 SDValue N0 = N->getOperand(0); 11084 SDValue N1 = N->getOperand(1); 11085 11086 // look for psign/blend 11087 if (Subtarget->hasSSSE3()) { 11088 if (VT == MVT::v2i64) { 11089 // Canonicalize pandn to RHS 11090 if (N0.getOpcode() == X86ISD::PANDN) 11091 std::swap(N0, N1); 11092 // or (and (m, x), (pandn m, y)) 11093 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::PANDN) { 11094 SDValue Mask = N1.getOperand(0); 11095 SDValue X = N1.getOperand(1); 11096 SDValue Y; 11097 if (N0.getOperand(0) == Mask) 11098 Y = N0.getOperand(1); 11099 if (N0.getOperand(1) == Mask) 11100 Y = N0.getOperand(0); 11101 11102 // Check to see if the mask appeared in both the AND and PANDN and 11103 if (!Y.getNode()) 11104 return SDValue(); 11105 11106 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 11107 if (Mask.getOpcode() != ISD::BITCAST || 11108 X.getOpcode() != ISD::BITCAST || 11109 Y.getOpcode() != ISD::BITCAST) 11110 return SDValue(); 11111 11112 // Look through mask bitcast. 11113 Mask = Mask.getOperand(0); 11114 EVT MaskVT = Mask.getValueType(); 11115 11116 // Validate that the Mask operand is a vector sra node. The sra node 11117 // will be an intrinsic. 11118 if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) 11119 return SDValue(); 11120 11121 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 11122 // there is no psrai.b 11123 switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { 11124 case Intrinsic::x86_sse2_psrai_w: 11125 case Intrinsic::x86_sse2_psrai_d: 11126 break; 11127 default: return SDValue(); 11128 } 11129 11130 // Check that the SRA is all signbits. 11131 SDValue SraC = Mask.getOperand(2); 11132 unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 11133 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 11134 if ((SraAmt + 1) != EltBits) 11135 return SDValue(); 11136 11137 DebugLoc DL = N->getDebugLoc(); 11138 11139 // Now we know we at least have a plendvb with the mask val. See if 11140 // we can form a psignb/w/d. 
11141 // psign = x.type == y.type == mask.type && y = sub(0, x); 11142 X = X.getOperand(0); 11143 Y = Y.getOperand(0); 11144 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 11145 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 11146 X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ 11147 unsigned Opc = 0; 11148 switch (EltBits) { 11149 case 8: Opc = X86ISD::PSIGNB; break; 11150 case 16: Opc = X86ISD::PSIGNW; break; 11151 case 32: Opc = X86ISD::PSIGND; break; 11152 default: break; 11153 } 11154 if (Opc) { 11155 SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); 11156 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); 11157 } 11158 } 11159 // PBLENDVB only available on SSE 4.1 11160 if (!Subtarget->hasSSE41()) 11161 return SDValue(); 11162 11163 unsigned IID = Intrinsic::x86_sse41_pblendvb; 11164 X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); 11165 Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); 11166 Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); 11167 Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8, 11168 DAG.getConstant(IID, MVT::i32), X, Y, Mask); 11169 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); 11170 } 11171 } 11172 } 11173 11174 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 11175 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 11176 std::swap(N0, N1); 11177 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 11178 return SDValue(); 11179 if (!N0.hasOneUse() || !N1.hasOneUse()) 11180 return SDValue(); 11181 11182 SDValue ShAmt0 = N0.getOperand(1); 11183 if (ShAmt0.getValueType() != MVT::i8) 11184 return SDValue(); 11185 SDValue ShAmt1 = N1.getOperand(1); 11186 if (ShAmt1.getValueType() != MVT::i8) 11187 return SDValue(); 11188 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 11189 ShAmt0 = ShAmt0.getOperand(0); 11190 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 11191 ShAmt1 = ShAmt1.getOperand(0); 11192 11193 DebugLoc DL = N->getDebugLoc(); 11194 unsigned Opc = X86ISD::SHLD; 11195 SDValue Op0 = N0.getOperand(0); 11196 SDValue Op1 = N1.getOperand(0); 11197 if (ShAmt0.getOpcode() == ISD::SUB) { 11198 Opc = X86ISD::SHRD; 11199 std::swap(Op0, Op1); 11200 std::swap(ShAmt0, ShAmt1); 11201 } 11202 11203 unsigned Bits = VT.getSizeInBits(); 11204 if (ShAmt1.getOpcode() == ISD::SUB) { 11205 SDValue Sum = ShAmt1.getOperand(0); 11206 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 11207 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 11208 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 11209 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 11210 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 11211 return DAG.getNode(Opc, DL, VT, 11212 Op0, Op1, 11213 DAG.getNode(ISD::TRUNCATE, DL, 11214 MVT::i8, ShAmt0)); 11215 } 11216 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 11217 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 11218 if (ShAmt0C && 11219 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 11220 return DAG.getNode(Opc, DL, VT, 11221 N0.getOperand(0), N1.getOperand(0), 11222 DAG.getNode(ISD::TRUNCATE, DL, 11223 MVT::i8, ShAmt0)); 11224 } 11225 11226 return SDValue(); 11227} 11228 11229/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 11230static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 11231 const X86Subtarget *Subtarget) { 11232 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 11233 // the FP state in cases where an emms may be missing. 
11234 // A preferable solution to the general problem is to figure out the right 11235 // places to insert EMMS. This qualifies as a quick hack. 11236 11237 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 11238 StoreSDNode *St = cast<StoreSDNode>(N); 11239 EVT VT = St->getValue().getValueType(); 11240 if (VT.getSizeInBits() != 64) 11241 return SDValue(); 11242 11243 const Function *F = DAG.getMachineFunction().getFunction(); 11244 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 11245 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 11246 && Subtarget->hasSSE2(); 11247 if ((VT.isVector() || 11248 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 11249 isa<LoadSDNode>(St->getValue()) && 11250 !cast<LoadSDNode>(St->getValue())->isVolatile() && 11251 St->getChain().hasOneUse() && !St->isVolatile()) { 11252 SDNode* LdVal = St->getValue().getNode(); 11253 LoadSDNode *Ld = 0; 11254 int TokenFactorIndex = -1; 11255 SmallVector<SDValue, 8> Ops; 11256 SDNode* ChainVal = St->getChain().getNode(); 11257 // Must be a store of a load. We currently handle two cases: the load 11258 // is a direct child, and it's under an intervening TokenFactor. It is 11259 // possible to dig deeper under nested TokenFactors. 11260 if (ChainVal == LdVal) 11261 Ld = cast<LoadSDNode>(St->getChain()); 11262 else if (St->getValue().hasOneUse() && 11263 ChainVal->getOpcode() == ISD::TokenFactor) { 11264 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 11265 if (ChainVal->getOperand(i).getNode() == LdVal) { 11266 TokenFactorIndex = i; 11267 Ld = cast<LoadSDNode>(St->getValue()); 11268 } else 11269 Ops.push_back(ChainVal->getOperand(i)); 11270 } 11271 } 11272 11273 if (!Ld || !ISD::isNormalLoad(Ld)) 11274 return SDValue(); 11275 11276 // If this is not the MMX case, i.e. we are just turning i64 load/store 11277 // into f64 load/store, avoid the transformation if there are multiple 11278 // uses of the loaded value. 11279 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 11280 return SDValue(); 11281 11282 DebugLoc LdDL = Ld->getDebugLoc(); 11283 DebugLoc StDL = N->getDebugLoc(); 11284 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 11285 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 11286 // pair instead. 11287 if (Subtarget->is64Bit() || F64IsLegal) { 11288 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 11289 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 11290 Ld->getPointerInfo(), Ld->isVolatile(), 11291 Ld->isNonTemporal(), Ld->getAlignment()); 11292 SDValue NewChain = NewLd.getValue(1); 11293 if (TokenFactorIndex != -1) { 11294 Ops.push_back(NewChain); 11295 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11296 Ops.size()); 11297 } 11298 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 11299 St->getPointerInfo(), 11300 St->isVolatile(), St->isNonTemporal(), 11301 St->getAlignment()); 11302 } 11303 11304 // Otherwise, lower to two pairs of 32-bit loads / stores. 
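// The low half keeps the original pointer info and alignment; the high half
// is addressed at offset 4 with MinAlign(alignment, 4).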
11305 SDValue LoAddr = Ld->getBasePtr(); 11306 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 11307 DAG.getConstant(4, MVT::i32)); 11308 11309 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 11310 Ld->getPointerInfo(), 11311 Ld->isVolatile(), Ld->isNonTemporal(), 11312 Ld->getAlignment()); 11313 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 11314 Ld->getPointerInfo().getWithOffset(4), 11315 Ld->isVolatile(), Ld->isNonTemporal(), 11316 MinAlign(Ld->getAlignment(), 4)); 11317 11318 SDValue NewChain = LoLd.getValue(1); 11319 if (TokenFactorIndex != -1) { 11320 Ops.push_back(LoLd); 11321 Ops.push_back(HiLd); 11322 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11323 Ops.size()); 11324 } 11325 11326 LoAddr = St->getBasePtr(); 11327 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 11328 DAG.getConstant(4, MVT::i32)); 11329 11330 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 11331 St->getPointerInfo(), 11332 St->isVolatile(), St->isNonTemporal(), 11333 St->getAlignment()); 11334 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 11335 St->getPointerInfo().getWithOffset(4), 11336 St->isVolatile(), 11337 St->isNonTemporal(), 11338 MinAlign(St->getAlignment(), 4)); 11339 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 11340 } 11341 return SDValue(); 11342} 11343 11344/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 11345/// X86ISD::FXOR nodes. 11346static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 11347 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 11348 // F[X]OR(0.0, x) -> x 11349 // F[X]OR(x, 0.0) -> x 11350 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11351 if (C->getValueAPF().isPosZero()) 11352 return N->getOperand(1); 11353 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11354 if (C->getValueAPF().isPosZero()) 11355 return N->getOperand(0); 11356 return SDValue(); 11357} 11358 11359/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 11360static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 11361 // FAND(0.0, x) -> 0.0 11362 // FAND(x, 0.0) -> 0.0 11363 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11364 if (C->getValueAPF().isPosZero()) 11365 return N->getOperand(0); 11366 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11367 if (C->getValueAPF().isPosZero()) 11368 return N->getOperand(1); 11369 return SDValue(); 11370} 11371 11372static SDValue PerformBTCombine(SDNode *N, 11373 SelectionDAG &DAG, 11374 TargetLowering::DAGCombinerInfo &DCI) { 11375 // BT ignores high bits in the bit index operand. 
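// Only the low Log2(BitWidth) bits of the index are demanded, so try to
// shrink a constant index or otherwise simplify the operand.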
11376 SDValue Op1 = N->getOperand(1); 11377 if (Op1.hasOneUse()) { 11378 unsigned BitWidth = Op1.getValueSizeInBits(); 11379 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 11380 APInt KnownZero, KnownOne; 11381 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 11382 !DCI.isBeforeLegalizeOps()); 11383 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11384 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 11385 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 11386 DCI.CommitTargetLoweringOpt(TLO); 11387 } 11388 return SDValue(); 11389} 11390 11391static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 11392 SDValue Op = N->getOperand(0); 11393 if (Op.getOpcode() == ISD::BITCAST) 11394 Op = Op.getOperand(0); 11395 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 11396 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 11397 VT.getVectorElementType().getSizeInBits() == 11398 OpVT.getVectorElementType().getSizeInBits()) { 11399 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 11400 } 11401 return SDValue(); 11402} 11403 11404static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 11405 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 11406 // (and (i32 x86isd::setcc_carry), 1) 11407 // This eliminates the zext. This transformation is necessary because 11408 // ISD::SETCC is always legalized to i8. 11409 DebugLoc dl = N->getDebugLoc(); 11410 SDValue N0 = N->getOperand(0); 11411 EVT VT = N->getValueType(0); 11412 if (N0.getOpcode() == ISD::AND && 11413 N0.hasOneUse() && 11414 N0.getOperand(0).hasOneUse()) { 11415 SDValue N00 = N0.getOperand(0); 11416 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 11417 return SDValue(); 11418 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 11419 if (!C || C->getZExtValue() != 1) 11420 return SDValue(); 11421 return DAG.getNode(ISD::AND, dl, VT, 11422 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 11423 N00.getOperand(0), N00.getOperand(1)), 11424 DAG.getConstant(1, VT)); 11425 } 11426 11427 return SDValue(); 11428} 11429 11430// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 11431static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 11432 unsigned X86CC = N->getConstantOperandVal(0); 11433 SDValue EFLAG = N->getOperand(1); 11434 DebugLoc DL = N->getDebugLoc(); 11435 11436 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 11437 // a zext and produces an all-ones bit which is more useful than 0/1 in some 11438 // cases. 
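// i.e. (X86ISD::SETCC COND_B, EFLAGS)
//        -> (and (X86ISD::SETCC_CARRY COND_B, EFLAGS), 1)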
11439 if (X86CC == X86::COND_B) 11440 return DAG.getNode(ISD::AND, DL, MVT::i8, 11441 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 11442 DAG.getConstant(X86CC, MVT::i8), EFLAG), 11443 DAG.getConstant(1, MVT::i8)); 11444 11445 return SDValue(); 11446} 11447 11448SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 11449 DAGCombinerInfo &DCI) const { 11450 SelectionDAG &DAG = DCI.DAG; 11451 switch (N->getOpcode()) { 11452 default: break; 11453 case ISD::EXTRACT_VECTOR_ELT: 11454 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 11455 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 11456 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 11457 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 11458 case ISD::SHL: 11459 case ISD::SRA: 11460 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 11461 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); 11462 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 11463 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 11464 case X86ISD::FXOR: 11465 case X86ISD::FOR: return PerformFORCombine(N, DAG); 11466 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 11467 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 11468 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 11469 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 11470 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); 11471 case X86ISD::SHUFPS: // Handle all target specific shuffles 11472 case X86ISD::SHUFPD: 11473 case X86ISD::PALIGN: 11474 case X86ISD::PUNPCKHBW: 11475 case X86ISD::PUNPCKHWD: 11476 case X86ISD::PUNPCKHDQ: 11477 case X86ISD::PUNPCKHQDQ: 11478 case X86ISD::UNPCKHPS: 11479 case X86ISD::UNPCKHPD: 11480 case X86ISD::PUNPCKLBW: 11481 case X86ISD::PUNPCKLWD: 11482 case X86ISD::PUNPCKLDQ: 11483 case X86ISD::PUNPCKLQDQ: 11484 case X86ISD::UNPCKLPS: 11485 case X86ISD::UNPCKLPD: 11486 case X86ISD::MOVHLPS: 11487 case X86ISD::MOVLHPS: 11488 case X86ISD::PSHUFD: 11489 case X86ISD::PSHUFHW: 11490 case X86ISD::PSHUFLW: 11491 case X86ISD::MOVSS: 11492 case X86ISD::MOVSD: 11493 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI); 11494 } 11495 11496 return SDValue(); 11497} 11498 11499/// isTypeDesirableForOp - Return true if the target has native support for 11500/// the specified value type and it is 'desirable' to use the type for the 11501/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 11502/// instruction encodings are longer and some i16 instructions are slow. 11503bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 11504 if (!isTypeLegal(VT)) 11505 return false; 11506 if (VT != MVT::i16) 11507 return true; 11508 11509 switch (Opc) { 11510 default: 11511 return true; 11512 case ISD::LOAD: 11513 case ISD::SIGN_EXTEND: 11514 case ISD::ZERO_EXTEND: 11515 case ISD::ANY_EXTEND: 11516 case ISD::SHL: 11517 case ISD::SRL: 11518 case ISD::SUB: 11519 case ISD::ADD: 11520 case ISD::MUL: 11521 case ISD::AND: 11522 case ISD::OR: 11523 case ISD::XOR: 11524 return false; 11525 } 11526} 11527 11528/// IsDesirableToPromoteOp - This method query the target whether it is 11529/// beneficial for dag combiner to promote the specified node. If true, it 11530/// should return the desired promotion type by reference. 
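/// On x86 the only promotion done here is i16 -> i32, and it is skipped when
/// it would lose the chance to fold a load into the operation or fold the
/// result into a store.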
11531 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
11532 EVT VT = Op.getValueType();
11533 if (VT != MVT::i16)
11534 return false;
11535
11536 bool Promote = false;
11537 bool Commute = false;
11538 switch (Op.getOpcode()) {
11539 default: break;
11540 case ISD::LOAD: {
11541 LoadSDNode *LD = cast<LoadSDNode>(Op);
11542 // If the non-extending load has a single use and it's not live out, then it
11543 // might be folded.
11544 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
11545 Op.hasOneUse()*/) {
11546 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
11547 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
11548 // The only case where we'd want to promote LOAD (rather than it being
11549 // promoted as an operand) is when its only use is live out.
11550 if (UI->getOpcode() != ISD::CopyToReg)
11551 return false;
11552 }
11553 }
11554 Promote = true;
11555 break;
11556 }
11557 case ISD::SIGN_EXTEND:
11558 case ISD::ZERO_EXTEND:
11559 case ISD::ANY_EXTEND:
11560 Promote = true;
11561 break;
11562 case ISD::SHL:
11563 case ISD::SRL: {
11564 SDValue N0 = Op.getOperand(0);
11565 // Look out for (store (shl (load), x)).
11566 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
11567 return false;
11568 Promote = true;
11569 break;
11570 }
11571 case ISD::ADD:
11572 case ISD::MUL:
11573 case ISD::AND:
11574 case ISD::OR:
11575 case ISD::XOR:
11576 Commute = true;
11577 // fallthrough
11578 case ISD::SUB: {
11579 SDValue N0 = Op.getOperand(0);
11580 SDValue N1 = Op.getOperand(1);
11581 if (!Commute && MayFoldLoad(N1))
11582 return false;
11583 // Avoid disabling potential load folding opportunities.
11584 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
11585 return false;
11586 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
11587 return false;
11588 Promote = true;
11589 }
11590 }
11591
11592 PVT = MVT::i32;
11593 return Promote;
11594 }
11595
11596 //===----------------------------------------------------------------------===//
11597 // X86 Inline Assembly Support
11598 //===----------------------------------------------------------------------===//
11599
11600 static bool LowerToBSwap(CallInst *CI) {
11601 // FIXME: this should verify that we are targeting a 486 or better. If not,
11602 // we will turn this bswap into something that will be lowered to logical ops
11603 // instead of emitting the bswap asm. For now, we don't support 486 or lower
11604 // so don't worry about this.
11605
11606 // Verify this is a simple bswap.
11607 if (CI->getNumArgOperands() != 1 ||
11608 CI->getType() != CI->getArgOperand(0)->getType() ||
11609 !CI->getType()->isIntegerTy())
11610 return false;
11611
11612 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
11613 if (!Ty || Ty->getBitWidth() % 16 != 0)
11614 return false;
11615
11616 // Okay, we can do this xform, do so now.
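// Build a call to llvm.bswap on the operand's type and replace the inline-asm
// call with it.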
11617 const Type *Tys[] = { Ty }; 11618 Module *M = CI->getParent()->getParent()->getParent(); 11619 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 11620 11621 Value *Op = CI->getArgOperand(0); 11622 Op = CallInst::Create(Int, Op, CI->getName(), CI); 11623 11624 CI->replaceAllUsesWith(Op); 11625 CI->eraseFromParent(); 11626 return true; 11627} 11628 11629bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 11630 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 11631 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 11632 11633 std::string AsmStr = IA->getAsmString(); 11634 11635 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 11636 SmallVector<StringRef, 4> AsmPieces; 11637 SplitString(AsmStr, AsmPieces, ";\n"); 11638 11639 switch (AsmPieces.size()) { 11640 default: return false; 11641 case 1: 11642 AsmStr = AsmPieces[0]; 11643 AsmPieces.clear(); 11644 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 11645 11646 // bswap $0 11647 if (AsmPieces.size() == 2 && 11648 (AsmPieces[0] == "bswap" || 11649 AsmPieces[0] == "bswapq" || 11650 AsmPieces[0] == "bswapl") && 11651 (AsmPieces[1] == "$0" || 11652 AsmPieces[1] == "${0:q}")) { 11653 // No need to check constraints, nothing other than the equivalent of 11654 // "=r,0" would be valid here. 11655 return LowerToBSwap(CI); 11656 } 11657 // rorw $$8, ${0:w} --> llvm.bswap.i16 11658 if (CI->getType()->isIntegerTy(16) && 11659 AsmPieces.size() == 3 && 11660 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 11661 AsmPieces[1] == "$$8," && 11662 AsmPieces[2] == "${0:w}" && 11663 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 11664 AsmPieces.clear(); 11665 const std::string &Constraints = IA->getConstraintString(); 11666 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 11667 std::sort(AsmPieces.begin(), AsmPieces.end()); 11668 if (AsmPieces.size() == 4 && 11669 AsmPieces[0] == "~{cc}" && 11670 AsmPieces[1] == "~{dirflag}" && 11671 AsmPieces[2] == "~{flags}" && 11672 AsmPieces[3] == "~{fpsr}") { 11673 return LowerToBSwap(CI); 11674 } 11675 } 11676 break; 11677 case 3: 11678 if (CI->getType()->isIntegerTy(32) && 11679 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 11680 SmallVector<StringRef, 4> Words; 11681 SplitString(AsmPieces[0], Words, " \t,"); 11682 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 11683 Words[2] == "${0:w}") { 11684 Words.clear(); 11685 SplitString(AsmPieces[1], Words, " \t,"); 11686 if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && 11687 Words[2] == "$0") { 11688 Words.clear(); 11689 SplitString(AsmPieces[2], Words, " \t,"); 11690 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 11691 Words[2] == "${0:w}") { 11692 AsmPieces.clear(); 11693 const std::string &Constraints = IA->getConstraintString(); 11694 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 11695 std::sort(AsmPieces.begin(), AsmPieces.end()); 11696 if (AsmPieces.size() == 4 && 11697 AsmPieces[0] == "~{cc}" && 11698 AsmPieces[1] == "~{dirflag}" && 11699 AsmPieces[2] == "~{flags}" && 11700 AsmPieces[3] == "~{fpsr}") { 11701 return LowerToBSwap(CI); 11702 } 11703 } 11704 } 11705 } 11706 } 11707 if (CI->getType()->isIntegerTy(64) && 11708 Constraints.size() >= 2 && 11709 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 11710 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 11711 // bswap %eax / bswap %edx / xchgl %eax, 
%edx -> llvm.bswap.i64 11712 SmallVector<StringRef, 4> Words; 11713 SplitString(AsmPieces[0], Words, " \t"); 11714 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 11715 Words.clear(); 11716 SplitString(AsmPieces[1], Words, " \t"); 11717 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 11718 Words.clear(); 11719 SplitString(AsmPieces[2], Words, " \t,"); 11720 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 11721 Words[2] == "%edx") { 11722 return LowerToBSwap(CI); 11723 } 11724 } 11725 } 11726 } 11727 break; 11728 } 11729 return false; 11730} 11731 11732 11733 11734/// getConstraintType - Given a constraint letter, return the type of 11735/// constraint it is for this target. 11736X86TargetLowering::ConstraintType 11737X86TargetLowering::getConstraintType(const std::string &Constraint) const { 11738 if (Constraint.size() == 1) { 11739 switch (Constraint[0]) { 11740 case 'R': 11741 case 'q': 11742 case 'Q': 11743 case 'f': 11744 case 't': 11745 case 'u': 11746 case 'y': 11747 case 'x': 11748 case 'Y': 11749 return C_RegisterClass; 11750 case 'a': 11751 case 'b': 11752 case 'c': 11753 case 'd': 11754 case 'S': 11755 case 'D': 11756 case 'A': 11757 return C_Register; 11758 case 'I': 11759 case 'J': 11760 case 'K': 11761 case 'L': 11762 case 'M': 11763 case 'N': 11764 case 'G': 11765 case 'C': 11766 case 'e': 11767 case 'Z': 11768 return C_Other; 11769 default: 11770 break; 11771 } 11772 } 11773 return TargetLowering::getConstraintType(Constraint); 11774} 11775 11776/// Examine constraint type and operand type and determine a weight value. 11777/// This object must already have been set up with the operand type 11778/// and the current alternative constraint selected. 11779TargetLowering::ConstraintWeight 11780 X86TargetLowering::getSingleConstraintMatchWeight( 11781 AsmOperandInfo &info, const char *constraint) const { 11782 ConstraintWeight weight = CW_Invalid; 11783 Value *CallOperandVal = info.CallOperandVal; 11784 // If we don't have a value, we can't do a match, 11785 // but allow it at the lowest weight. 11786 if (CallOperandVal == NULL) 11787 return CW_Default; 11788 const Type *type = CallOperandVal->getType(); 11789 // Look at the constraint type. 
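// Register constraints score as CW_SpecificReg/CW_Register when the operand
// type fits the register class; immediate constraints score as CW_Constant
// only when the value fits the constraint's range.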
11790 switch (*constraint) { 11791 default: 11792 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 11793 case 'R': 11794 case 'q': 11795 case 'Q': 11796 case 'a': 11797 case 'b': 11798 case 'c': 11799 case 'd': 11800 case 'S': 11801 case 'D': 11802 case 'A': 11803 if (CallOperandVal->getType()->isIntegerTy()) 11804 weight = CW_SpecificReg; 11805 break; 11806 case 'f': 11807 case 't': 11808 case 'u': 11809 if (type->isFloatingPointTy()) 11810 weight = CW_SpecificReg; 11811 break; 11812 case 'y': 11813 if (type->isX86_MMXTy() && Subtarget->hasMMX()) 11814 weight = CW_SpecificReg; 11815 break; 11816 case 'x': 11817 case 'Y': 11818 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) 11819 weight = CW_Register; 11820 break; 11821 case 'I': 11822 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 11823 if (C->getZExtValue() <= 31) 11824 weight = CW_Constant; 11825 } 11826 break; 11827 case 'J': 11828 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11829 if (C->getZExtValue() <= 63) 11830 weight = CW_Constant; 11831 } 11832 break; 11833 case 'K': 11834 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11835 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 11836 weight = CW_Constant; 11837 } 11838 break; 11839 case 'L': 11840 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11841 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 11842 weight = CW_Constant; 11843 } 11844 break; 11845 case 'M': 11846 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11847 if (C->getZExtValue() <= 3) 11848 weight = CW_Constant; 11849 } 11850 break; 11851 case 'N': 11852 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11853 if (C->getZExtValue() <= 0xff) 11854 weight = CW_Constant; 11855 } 11856 break; 11857 case 'G': 11858 case 'C': 11859 if (dyn_cast<ConstantFP>(CallOperandVal)) { 11860 weight = CW_Constant; 11861 } 11862 break; 11863 case 'e': 11864 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11865 if ((C->getSExtValue() >= -0x80000000LL) && 11866 (C->getSExtValue() <= 0x7fffffffLL)) 11867 weight = CW_Constant; 11868 } 11869 break; 11870 case 'Z': 11871 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11872 if (C->getZExtValue() <= 0xffffffff) 11873 weight = CW_Constant; 11874 } 11875 break; 11876 } 11877 return weight; 11878} 11879 11880/// LowerXConstraint - try to replace an X constraint, which matches anything, 11881/// with another that has more specific requirements based on the type of the 11882/// corresponding operand. 11883const char *X86TargetLowering:: 11884LowerXConstraint(EVT ConstraintVT) const { 11885 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 11886 // 'f' like normal targets. 11887 if (ConstraintVT.isFloatingPoint()) { 11888 if (Subtarget->hasXMMInt()) 11889 return "Y"; 11890 if (Subtarget->hasXMM()) 11891 return "x"; 11892 } 11893 11894 return TargetLowering::LowerXConstraint(ConstraintVT); 11895} 11896 11897/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 11898/// vector. If it is invalid, don't add anything to Ops. 
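/// On x86 this handles the immediate constraints 'I', 'J', 'K', 'N', 'e' and
/// 'Z' (each range-checked) and 'i' (literal immediates, plus non-PIC global
/// addresses with an optional constant offset).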
11899void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 11900 char Constraint, 11901 std::vector<SDValue>&Ops, 11902 SelectionDAG &DAG) const { 11903 SDValue Result(0, 0); 11904 11905 switch (Constraint) { 11906 default: break; 11907 case 'I': 11908 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11909 if (C->getZExtValue() <= 31) { 11910 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11911 break; 11912 } 11913 } 11914 return; 11915 case 'J': 11916 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11917 if (C->getZExtValue() <= 63) { 11918 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11919 break; 11920 } 11921 } 11922 return; 11923 case 'K': 11924 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11925 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 11926 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11927 break; 11928 } 11929 } 11930 return; 11931 case 'N': 11932 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11933 if (C->getZExtValue() <= 255) { 11934 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11935 break; 11936 } 11937 } 11938 return; 11939 case 'e': { 11940 // 32-bit signed value 11941 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11942 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 11943 C->getSExtValue())) { 11944 // Widen to 64 bits here to get it sign extended. 11945 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 11946 break; 11947 } 11948 // FIXME gcc accepts some relocatable values here too, but only in certain 11949 // memory models; it's complicated. 11950 } 11951 return; 11952 } 11953 case 'Z': { 11954 // 32-bit unsigned value 11955 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11956 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 11957 C->getZExtValue())) { 11958 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11959 break; 11960 } 11961 } 11962 // FIXME gcc accepts some relocatable values here too, but only in certain 11963 // memory models; it's complicated. 11964 return; 11965 } 11966 case 'i': { 11967 // Literal immediates are always ok. 11968 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 11969 // Widen to 64 bits here to get it sign extended. 11970 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 11971 break; 11972 } 11973 11974 // In any sort of PIC mode addresses need to be computed at runtime by 11975 // adding in a register or some sort of table lookup. These can't 11976 // be used as immediates. 11977 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 11978 return; 11979 11980 // If we are in non-pic codegen mode, we allow the address of a global (with 11981 // an optional displacement) to be used with 'i'. 11982 GlobalAddressSDNode *GA = 0; 11983 int64_t Offset = 0; 11984 11985 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
11986 while (1) { 11987 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 11988 Offset += GA->getOffset(); 11989 break; 11990 } else if (Op.getOpcode() == ISD::ADD) { 11991 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 11992 Offset += C->getZExtValue(); 11993 Op = Op.getOperand(0); 11994 continue; 11995 } 11996 } else if (Op.getOpcode() == ISD::SUB) { 11997 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 11998 Offset += -C->getZExtValue(); 11999 Op = Op.getOperand(0); 12000 continue; 12001 } 12002 } 12003 12004 // Otherwise, this isn't something we can handle, reject it. 12005 return; 12006 } 12007 12008 const GlobalValue *GV = GA->getGlobal(); 12009 // If we require an extra load to get this address, as in PIC mode, we 12010 // can't accept it. 12011 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 12012 getTargetMachine()))) 12013 return; 12014 12015 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 12016 GA->getValueType(0), Offset); 12017 break; 12018 } 12019 } 12020 12021 if (Result.getNode()) { 12022 Ops.push_back(Result); 12023 return; 12024 } 12025 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 12026} 12027 12028std::vector<unsigned> X86TargetLowering:: 12029getRegClassForInlineAsmConstraint(const std::string &Constraint, 12030 EVT VT) const { 12031 if (Constraint.size() == 1) { 12032 // FIXME: not handling fp-stack yet! 12033 switch (Constraint[0]) { // GCC X86 Constraint Letters 12034 default: break; // Unknown constraint letter 12035 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 12036 if (Subtarget->is64Bit()) { 12037 if (VT == MVT::i32) 12038 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 12039 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 12040 X86::R10D,X86::R11D,X86::R12D, 12041 X86::R13D,X86::R14D,X86::R15D, 12042 X86::EBP, X86::ESP, 0); 12043 else if (VT == MVT::i16) 12044 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 12045 X86::SI, X86::DI, X86::R8W,X86::R9W, 12046 X86::R10W,X86::R11W,X86::R12W, 12047 X86::R13W,X86::R14W,X86::R15W, 12048 X86::BP, X86::SP, 0); 12049 else if (VT == MVT::i8) 12050 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 12051 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 12052 X86::R10B,X86::R11B,X86::R12B, 12053 X86::R13B,X86::R14B,X86::R15B, 12054 X86::BPL, X86::SPL, 0); 12055 12056 else if (VT == MVT::i64) 12057 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 12058 X86::RSI, X86::RDI, X86::R8, X86::R9, 12059 X86::R10, X86::R11, X86::R12, 12060 X86::R13, X86::R14, X86::R15, 12061 X86::RBP, X86::RSP, 0); 12062 12063 break; 12064 } 12065 // 32-bit fallthrough 12066 case 'Q': // Q_REGS 12067 if (VT == MVT::i32) 12068 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 12069 else if (VT == MVT::i16) 12070 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 12071 else if (VT == MVT::i8) 12072 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 12073 else if (VT == MVT::i64) 12074 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 12075 break; 12076 } 12077 } 12078 12079 return std::vector<unsigned>(); 12080} 12081 12082std::pair<unsigned, const TargetRegisterClass*> 12083X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 12084 EVT VT) const { 12085 // First, see if this is a constraint that directly corresponds to an LLVM 12086 // register class. 
12087 if (Constraint.size() == 1) { 12088 // GCC Constraint Letters 12089 switch (Constraint[0]) { 12090 default: break; 12091 case 'r': // GENERAL_REGS 12092 case 'l': // INDEX_REGS 12093 if (VT == MVT::i8) 12094 return std::make_pair(0U, X86::GR8RegisterClass); 12095 if (VT == MVT::i16) 12096 return std::make_pair(0U, X86::GR16RegisterClass); 12097 if (VT == MVT::i32 || !Subtarget->is64Bit()) 12098 return std::make_pair(0U, X86::GR32RegisterClass); 12099 return std::make_pair(0U, X86::GR64RegisterClass); 12100 case 'R': // LEGACY_REGS 12101 if (VT == MVT::i8) 12102 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 12103 if (VT == MVT::i16) 12104 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 12105 if (VT == MVT::i32 || !Subtarget->is64Bit()) 12106 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 12107 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 12108 case 'f': // FP Stack registers. 12109 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 12110 // value to the correct fpstack register class. 12111 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 12112 return std::make_pair(0U, X86::RFP32RegisterClass); 12113 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 12114 return std::make_pair(0U, X86::RFP64RegisterClass); 12115 return std::make_pair(0U, X86::RFP80RegisterClass); 12116 case 'y': // MMX_REGS if MMX allowed. 12117 if (!Subtarget->hasMMX()) break; 12118 return std::make_pair(0U, X86::VR64RegisterClass); 12119 case 'Y': // SSE_REGS if SSE2 allowed 12120 if (!Subtarget->hasXMMInt()) break; 12121 // FALL THROUGH. 12122 case 'x': // SSE_REGS if SSE1 allowed 12123 if (!Subtarget->hasXMM()) break; 12124 12125 switch (VT.getSimpleVT().SimpleTy) { 12126 default: break; 12127 // Scalar SSE types. 12128 case MVT::f32: 12129 case MVT::i32: 12130 return std::make_pair(0U, X86::FR32RegisterClass); 12131 case MVT::f64: 12132 case MVT::i64: 12133 return std::make_pair(0U, X86::FR64RegisterClass); 12134 // Vector types. 12135 case MVT::v16i8: 12136 case MVT::v8i16: 12137 case MVT::v4i32: 12138 case MVT::v2i64: 12139 case MVT::v4f32: 12140 case MVT::v2f64: 12141 return std::make_pair(0U, X86::VR128RegisterClass); 12142 } 12143 break; 12144 } 12145 } 12146 12147 // Use the default implementation in TargetLowering to convert the register 12148 // constraint into a member of a register class. 12149 std::pair<unsigned, const TargetRegisterClass*> Res; 12150 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 12151 12152 // Not found as a standard register? 12153 if (Res.second == 0) { 12154 // Map st(0) -> st(7) -> ST0 12155 if (Constraint.size() == 7 && Constraint[0] == '{' && 12156 tolower(Constraint[1]) == 's' && 12157 tolower(Constraint[2]) == 't' && 12158 Constraint[3] == '(' && 12159 (Constraint[4] >= '0' && Constraint[4] <= '7') && 12160 Constraint[5] == ')' && 12161 Constraint[6] == '}') { 12162 12163 Res.first = X86::ST0+Constraint[4]-'0'; 12164 Res.second = X86::RFP80RegisterClass; 12165 return Res; 12166 } 12167 12168 // GCC allows "st(0)" to be called just plain "st". 12169 if (StringRef("{st}").equals_lower(Constraint)) { 12170 Res.first = X86::ST0; 12171 Res.second = X86::RFP80RegisterClass; 12172 return Res; 12173 } 12174 12175 // flags -> EFLAGS 12176 if (StringRef("{flags}").equals_lower(Constraint)) { 12177 Res.first = X86::EFLAGS; 12178 Res.second = X86::CCRRegisterClass; 12179 return Res; 12180 } 12181 12182 // 'A' means EAX + EDX. 
12183 if (Constraint == "A") { 12184 Res.first = X86::EAX; 12185 Res.second = X86::GR32_ADRegisterClass; 12186 return Res; 12187 } 12188 return Res; 12189 } 12190 12191 // Otherwise, check to see if this is a register class of the wrong value 12192 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 12193 // turn into {ax},{dx}. 12194 if (Res.second->hasType(VT)) 12195 return Res; // Correct type already, nothing to do. 12196 12197 // All of the single-register GCC register classes map their values onto 12198 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 12199 // really want an 8-bit or 32-bit register, map to the appropriate register 12200 // class and return the appropriate register. 12201 if (Res.second == X86::GR16RegisterClass) { 12202 if (VT == MVT::i8) { 12203 unsigned DestReg = 0; 12204 switch (Res.first) { 12205 default: break; 12206 case X86::AX: DestReg = X86::AL; break; 12207 case X86::DX: DestReg = X86::DL; break; 12208 case X86::CX: DestReg = X86::CL; break; 12209 case X86::BX: DestReg = X86::BL; break; 12210 } 12211 if (DestReg) { 12212 Res.first = DestReg; 12213 Res.second = X86::GR8RegisterClass; 12214 } 12215 } else if (VT == MVT::i32) { 12216 unsigned DestReg = 0; 12217 switch (Res.first) { 12218 default: break; 12219 case X86::AX: DestReg = X86::EAX; break; 12220 case X86::DX: DestReg = X86::EDX; break; 12221 case X86::CX: DestReg = X86::ECX; break; 12222 case X86::BX: DestReg = X86::EBX; break; 12223 case X86::SI: DestReg = X86::ESI; break; 12224 case X86::DI: DestReg = X86::EDI; break; 12225 case X86::BP: DestReg = X86::EBP; break; 12226 case X86::SP: DestReg = X86::ESP; break; 12227 } 12228 if (DestReg) { 12229 Res.first = DestReg; 12230 Res.second = X86::GR32RegisterClass; 12231 } 12232 } else if (VT == MVT::i64) { 12233 unsigned DestReg = 0; 12234 switch (Res.first) { 12235 default: break; 12236 case X86::AX: DestReg = X86::RAX; break; 12237 case X86::DX: DestReg = X86::RDX; break; 12238 case X86::CX: DestReg = X86::RCX; break; 12239 case X86::BX: DestReg = X86::RBX; break; 12240 case X86::SI: DestReg = X86::RSI; break; 12241 case X86::DI: DestReg = X86::RDI; break; 12242 case X86::BP: DestReg = X86::RBP; break; 12243 case X86::SP: DestReg = X86::RSP; break; 12244 } 12245 if (DestReg) { 12246 Res.first = DestReg; 12247 Res.second = X86::GR64RegisterClass; 12248 } 12249 } 12250 } else if (Res.second == X86::FR32RegisterClass || 12251 Res.second == X86::FR64RegisterClass || 12252 Res.second == X86::VR128RegisterClass) { 12253 // Handle references to XMM physical registers that got mapped into the 12254 // wrong class. This can happen with constraints like {xmm0} where the 12255 // target independent register mapper will just pick the first match it can 12256 // find, ignoring the required type. 12257 if (VT == MVT::f32) 12258 Res.second = X86::FR32RegisterClass; 12259 else if (VT == MVT::f64) 12260 Res.second = X86::FR64RegisterClass; 12261 else if (X86::VR128RegisterClass->hasType(VT)) 12262 Res.second = X86::VR128RegisterClass; 12263 } 12264 12265 return Res; 12266} 12267