X86ISelLowering.cpp revision 3efc0778c9baacc3dff4c50b62396d14f2420ba5
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();

  if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
    if (is64Bit) return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  } else if (TM.getSubtarget<X86Subtarget>().isTargetELF()) {
    if (is64Bit) return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) {
    return new TargetLoweringObjectFileCOFF();
  }
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
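  // For example, the count operand of a 32-bit shift is an i8 (the value that
  // ends up in %cl), and a setcc such as SETNE produces an i8 result (setne
  // %al) that is zero-extended when a wider boolean is needed.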
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but plain longjmp (no
    // underscore)!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
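  // (Illustrative: on 32-bit x86 without SSE3, the i64 cases go through a
  // stack temporary - e.g. an fp-to-i64 conversion becomes an FIST*-to-memory
  // sequence with the x87 rounding mode temporarily forced to truncation.)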
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory; i64->MMX is Legal.
      if (Subtarget->hasMMX() && !DisableMMX)
        setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
      else
        setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
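  //
  // For example, an i32 "x / y" and "x % y" over the same operands are
  // combined into a single ISD::SDIVREM node, which then selects to one IDIV
  // producing the quotient in EAX and the remainder in EDX.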
  setOperationAction(ISD::MULHS, MVT::i8, Expand);
  setOperationAction(ISD::MULHU, MVT::i8, Expand);
  setOperationAction(ISD::SDIV, MVT::i8, Expand);
  setOperationAction(ISD::UDIV, MVT::i8, Expand);
  setOperationAction(ISD::SREM, MVT::i8, Expand);
  setOperationAction(ISD::UREM, MVT::i8, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV, MVT::i16, Expand);
  setOperationAction(ISD::UDIV, MVT::i16, Expand);
  setOperationAction(ISD::SREM, MVT::i16, Expand);
  setOperationAction(ISD::UREM, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8, Expand);
  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i16, Custom);
  setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
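  // (The custom lowering turns these selects into X86ISD::CMOV nodes keyed
  // off a condition code and EFLAGS, so e.g. "select (setlt a, b), x, y" can
  // become a cmp followed by cmovl.)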
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER so we should lower this.
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
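  //
  // For example, a memory barrier immediately before or after an atomic add
  // can be folded away, since the LOCK-prefixed instruction already acts as a
  // full barrier.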
  setShouldFoldAtomicFences(true);

  // Expand certain atomics.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags.
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
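    // (These are all sign-bit tricks on the SSE registers: FABS is an AND
    // with a constant that clears the sign bit, FNEG is an XOR that flips it,
    // and FCOPYSIGN combines the magnitude of one operand with the sign bit
    // of the other using AND/OR-style masking.)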
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
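  // The only f80 immediates registered as legal below are +/-0.0 and +/-1.0,
  // which the x87 unit can materialize directly with FLD0/FLD1 (plus FCHS for
  // the negated forms); other f80 constants come from the constant pool.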
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);

    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType(ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);

    if (!X86ScalarSSEf64 && Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom);
    }
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT().SimpleTy,
                         Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType(ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType(ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType(ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType(ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType(ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
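    // (For example, a v4i32 shift-left by a constant splat can be rewritten
    // as a multiply by the corresponding power of two, which maps onto the
    // SSE4.1 PMULLD just marked Legal above; v16i8 needs more massaging,
    // hence Custom.)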
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out: v16i16, v32i8.
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX.

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX.

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
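  //
  // (Illustrative: these hooks are what llvm.sadd.with.overflow and friends
  // lower through - the arithmetic is emitted as an X86 node that also
  // defines EFLAGS, and the overflow bit is read back with a SETO/JO-style
  // setcc.)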
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SADDO, MVT::i64, Custom);
    setOperationAction(ISD::UADDO, MVT::i64, Custom);
    setOperationAction(ISD::SSUBO, MVT::i64, Custom);
    setOperationAction(ISD::USUBO, MVT::i64, Custom);
    setOperationAction(ISD::SMULO, MVT::i64, Custom);
  }

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are optimizing for size.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 8;   // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero that means it's safe because any destination
/// alignment can satisfy the constraint.
/// Similarly, if SrcAlign is zero it
/// means there isn't a need to check it against the alignment requirement,
/// probably because the source does not need to be loaded. If
/// 'NonScalarIntSafe' is true, that means it's safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// getPICBaseSymbol - Return the X86-32 PIC base.
MCSymbol *
X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
                                    MCContext &Ctx) const {
  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) +
                               Twine(MF->getFunctionNumber()) + "$pb");
}


const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
/// jumptable.
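/// For 32-bit PIC (GOT-style) code the jump-table entries above are emitted
/// as Label@GOTOFF, so the base returned here is the PIC base register
/// materialized by X86ISD::GlobalBaseReg; on x86-64 the table itself is used.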
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const {
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = (Subtarget->is64Bit()
           ? X86::GR64RegisterClass : X86::GR32RegisterClass);
    break;
  case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64:
    RRC = X86::VR64RegisterClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
  case MVT::v4f64:
    RRC = X86::VR128RegisterClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned
X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
                                       MachineFunction &MF) const {
  unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0;
  switch (RC->getID()) {
  default:
    return 0;
  case X86::GR32RegClassID:
    return 4 - FPDiff;
  case X86::GR64RegClassID:
    return 8 - FPDiff;
  case X86::VR128RegClassID:
    return Subtarget->is64Bit() ? 10 : 4;
  case X86::VR64RegClassID:
    return 4;
  }
}

bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}


//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
                                         MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = OutVals[i];
    EVT ValVT = ValToCopy.getValueType();

    // If this is x86-64, and we disabled SSE, we can't return FP values.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64) &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }
    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
    // llvm-gcc has never done it right and no one has noticed, so this
    // should be OK for now.
    if (ValVT == MVT::f64 &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE2())) {
      report_fatal_error("SSE2 register return with SSE2 disabled");
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
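    // (For example, returning an f64 that currently lives in an XMM register
    // is extended to f80 below and handed to the RET_FLAG node as an extra
    // operand; the FP stackifier later places it in ST(0).)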
1329 if (VA.getLocReg() == X86::ST0 || 1330 VA.getLocReg() == X86::ST1) { 1331 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1332 // change the value to the FP stack register class. 1333 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1334 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1335 RetOps.push_back(ValToCopy); 1336 // Don't emit a copytoreg. 1337 continue; 1338 } 1339 1340 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1341 // which is returned in RAX / RDX. 1342 if (Subtarget->is64Bit()) { 1343 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { 1344 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1345 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) 1346 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1347 ValToCopy); 1348 } 1349 } 1350 1351 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1352 Flag = Chain.getValue(1); 1353 } 1354 1355 // The x86-64 ABI for returning structs by value requires that we copy 1356 // the sret argument into %rax for the return. We saved the argument into 1357 // a virtual register in the entry block, so now we copy the value out 1358 // and into %rax. 1359 if (Subtarget->is64Bit() && 1360 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1361 MachineFunction &MF = DAG.getMachineFunction(); 1362 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1363 unsigned Reg = FuncInfo->getSRetReturnReg(); 1364 assert(Reg && 1365 "SRetReturnReg should have been set in LowerFormalArguments()."); 1366 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1367 1368 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1369 Flag = Chain.getValue(1); 1370 1371 // RAX now acts like a return value. 1372 MRI.addLiveOut(X86::RAX); 1373 } 1374 1375 RetOps[0] = Chain; // Update chain. 1376 1377 // Add the flag if we have it. 1378 if (Flag.getNode()) 1379 RetOps.push_back(Flag); 1380 1381 return DAG.getNode(X86ISD::RET_FLAG, dl, 1382 MVT::Other, &RetOps[0], RetOps.size()); 1383} 1384 1385/// LowerCallResult - Lower the result values of a call into the 1386/// appropriate copies out of appropriate physical registers. 1387/// 1388SDValue 1389X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1390 CallingConv::ID CallConv, bool isVarArg, 1391 const SmallVectorImpl<ISD::InputArg> &Ins, 1392 DebugLoc dl, SelectionDAG &DAG, 1393 SmallVectorImpl<SDValue> &InVals) const { 1394 1395 // Assign locations to each value returned by this call. 1396 SmallVector<CCValAssign, 16> RVLocs; 1397 bool Is64Bit = Subtarget->is64Bit(); 1398 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1399 RVLocs, *DAG.getContext()); 1400 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1401 1402 // Copy all of the result registers out of their specified physreg. 
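  // For illustration (hypothetical call site, 32-bit SSE2 codegen assumed):
  //   double d = g();
  // g() returns its value in ST0 per the i386 C ABI; the loop below pops it
  // with FpGET_ST0_80 as an f80 and FP_ROUNDs it back to f64, which also
  // moves it into the XMM register where we actually want it.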
1403 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1404 CCValAssign &VA = RVLocs[i]; 1405 EVT CopyVT = VA.getValVT(); 1406 1407 // If this is x86-64, and we disabled SSE, we can't return FP values 1408 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1409 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1410 report_fatal_error("SSE register return with SSE disabled"); 1411 } 1412 1413 SDValue Val; 1414 1415 // If this is a call to a function that returns an fp value on the floating 1416 // point stack, we must guarantee the the value is popped from the stack, so 1417 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1418 // if the return value is not used. We use the FpGET_ST0 instructions 1419 // instead. 1420 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1421 // If we prefer to use the value in xmm registers, copy it out as f80 and 1422 // use a truncate to move it from fp stack reg to xmm reg. 1423 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1424 bool isST0 = VA.getLocReg() == X86::ST0; 1425 unsigned Opc = 0; 1426 if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32; 1427 if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64; 1428 if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80; 1429 SDValue Ops[] = { Chain, InFlag }; 1430 Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag, 1431 Ops, 2), 1); 1432 Val = Chain.getValue(0); 1433 1434 // Round the f80 to the right size, which also moves it to the appropriate 1435 // xmm register. 1436 if (CopyVT != VA.getValVT()) 1437 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1438 // This truncation won't change the value. 1439 DAG.getIntPtrConstant(1)); 1440 } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1441 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1442 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1443 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1444 MVT::v2i64, InFlag).getValue(1); 1445 Val = Chain.getValue(0); 1446 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1447 Val, DAG.getConstant(0, MVT::i64)); 1448 } else { 1449 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1450 MVT::i64, InFlag).getValue(1); 1451 Val = Chain.getValue(0); 1452 } 1453 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1454 } else { 1455 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1456 CopyVT, InFlag).getValue(1); 1457 Val = Chain.getValue(0); 1458 } 1459 InFlag = Chain.getValue(2); 1460 InVals.push_back(Val); 1461 } 1462 1463 return Chain; 1464} 1465 1466 1467//===----------------------------------------------------------------------===// 1468// C & StdCall & Fast Calling Convention implementation 1469//===----------------------------------------------------------------------===// 1470// StdCall calling convention seems to be standard for many Windows' API 1471// routines and around. It differs from C calling convention just a little: 1472// callee should clean up the stack, not caller. Symbols should be also 1473// decorated in some fancy way :) It doesn't support any vector arguments. 1474// For info on fast calling convention see Fast Calling Convention (tail call) 1475// implementation LowerX86_32FastCCCallTo. 1476 1477/// CallIsStructReturn - Determines whether a call uses struct return 1478/// semantics. 
1479static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1480 if (Outs.empty()) 1481 return false; 1482 1483 return Outs[0].Flags.isSRet(); 1484} 1485 1486/// ArgsAreStructReturn - Determines whether a function uses struct 1487/// return semantics. 1488static bool 1489ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1490 if (Ins.empty()) 1491 return false; 1492 1493 return Ins[0].Flags.isSRet(); 1494} 1495 1496/// CCAssignFnForNode - Selects the correct CCAssignFn for a the 1497/// given CallingConvention value. 1498CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 1499 if (Subtarget->is64Bit()) { 1500 if (CC == CallingConv::GHC) 1501 return CC_X86_64_GHC; 1502 else if (Subtarget->isTargetWin64()) 1503 return CC_X86_Win64_C; 1504 else 1505 return CC_X86_64_C; 1506 } 1507 1508 if (CC == CallingConv::X86_FastCall) 1509 return CC_X86_32_FastCall; 1510 else if (CC == CallingConv::X86_ThisCall) 1511 return CC_X86_32_ThisCall; 1512 else if (CC == CallingConv::Fast) 1513 return CC_X86_32_FastCC; 1514 else if (CC == CallingConv::GHC) 1515 return CC_X86_32_GHC; 1516 else 1517 return CC_X86_32_C; 1518} 1519 1520/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1521/// by "Src" to address "Dst" with size and alignment information specified by 1522/// the specific parameter attribute. The copy will be passed as a byval 1523/// function parameter. 1524static SDValue 1525CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1526 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1527 DebugLoc dl) { 1528 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1529 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1530 /*isVolatile*/false, /*AlwaysInline=*/true, 1531 NULL, 0, NULL, 0); 1532} 1533 1534/// IsTailCallConvention - Return true if the calling convention is one that 1535/// supports tail call optimization. 1536static bool IsTailCallConvention(CallingConv::ID CC) { 1537 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1538} 1539 1540/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1541/// a tailcall target by changing its ABI. 1542static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1543 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1544} 1545 1546SDValue 1547X86TargetLowering::LowerMemArgument(SDValue Chain, 1548 CallingConv::ID CallConv, 1549 const SmallVectorImpl<ISD::InputArg> &Ins, 1550 DebugLoc dl, SelectionDAG &DAG, 1551 const CCValAssign &VA, 1552 MachineFrameInfo *MFI, 1553 unsigned i) const { 1554 // Create the nodes corresponding to a load from this parameter slot. 1555 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1556 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1557 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1558 EVT ValVT; 1559 1560 // If value is passed by pointer we have address passed instead of the value 1561 // itself. 1562 if (VA.getLocInfo() == CCValAssign::Indirect) 1563 ValVT = VA.getLocVT(); 1564 else 1565 ValVT = VA.getValVT(); 1566 1567 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1568 // changed with more analysis. 1569 // In case of tail call optimization mark all arguments mutable. Since they 1570 // could be overwritten by lowering of arguments in case of a tail call. 
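  // For illustration (hypothetical IR; %struct.S assumed to be 32 bytes):
  //   define void @use(%struct.S* byval %s)
  // takes the byval path below: a 32-byte fixed stack object is created at
  // the argument's offset and the frame index itself is returned, so the
  // body addresses the aggregate in place instead of loading a value.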
1571 if (Flags.isByVal()) { 1572 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1573 VA.getLocMemOffset(), isImmutable); 1574 return DAG.getFrameIndex(FI, getPointerTy()); 1575 } else { 1576 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1577 VA.getLocMemOffset(), isImmutable); 1578 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1579 return DAG.getLoad(ValVT, dl, Chain, FIN, 1580 PseudoSourceValue::getFixedStack(FI), 0, 1581 false, false, 0); 1582 } 1583} 1584 1585SDValue 1586X86TargetLowering::LowerFormalArguments(SDValue Chain, 1587 CallingConv::ID CallConv, 1588 bool isVarArg, 1589 const SmallVectorImpl<ISD::InputArg> &Ins, 1590 DebugLoc dl, 1591 SelectionDAG &DAG, 1592 SmallVectorImpl<SDValue> &InVals) 1593 const { 1594 MachineFunction &MF = DAG.getMachineFunction(); 1595 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1596 1597 const Function* Fn = MF.getFunction(); 1598 if (Fn->hasExternalLinkage() && 1599 Subtarget->isTargetCygMing() && 1600 Fn->getName() == "main") 1601 FuncInfo->setForceFramePointer(true); 1602 1603 MachineFrameInfo *MFI = MF.getFrameInfo(); 1604 bool Is64Bit = Subtarget->is64Bit(); 1605 bool IsWin64 = Subtarget->isTargetWin64(); 1606 1607 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1608 "Var args not supported with calling convention fastcc or ghc"); 1609 1610 // Assign locations to all of the incoming arguments. 1611 SmallVector<CCValAssign, 16> ArgLocs; 1612 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1613 ArgLocs, *DAG.getContext()); 1614 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1615 1616 unsigned LastVal = ~0U; 1617 SDValue ArgValue; 1618 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1619 CCValAssign &VA = ArgLocs[i]; 1620 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1621 // places. 1622 assert(VA.getValNo() != LastVal && 1623 "Don't support value assigned to multiple locs yet"); 1624 LastVal = VA.getValNo(); 1625 1626 if (VA.isRegLoc()) { 1627 EVT RegVT = VA.getLocVT(); 1628 TargetRegisterClass *RC = NULL; 1629 if (RegVT == MVT::i32) 1630 RC = X86::GR32RegisterClass; 1631 else if (Is64Bit && RegVT == MVT::i64) 1632 RC = X86::GR64RegisterClass; 1633 else if (RegVT == MVT::f32) 1634 RC = X86::FR32RegisterClass; 1635 else if (RegVT == MVT::f64) 1636 RC = X86::FR64RegisterClass; 1637 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1638 RC = X86::VR256RegisterClass; 1639 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1640 RC = X86::VR128RegisterClass; 1641 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1642 RC = X86::VR64RegisterClass; 1643 else 1644 llvm_unreachable("Unknown argument type!"); 1645 1646 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1647 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1648 1649 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1650 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1651 // right size. 1652 if (VA.getLocInfo() == CCValAssign::SExt) 1653 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1654 DAG.getValueType(VA.getValVT())); 1655 else if (VA.getLocInfo() == CCValAssign::ZExt) 1656 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1657 DAG.getValueType(VA.getValVT())); 1658 else if (VA.getLocInfo() == CCValAssign::BCvt) 1659 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1660 1661 if (VA.isExtInLoc()) { 1662 // Handle MMX values passed in XMM regs. 
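        // For illustration (hypothetical x86-64 argument): a v1i64 value the
        // calling convention assigned to an XMM register arrives here with a
        // wider vector loc type; the extract/bit-convert below pulls element
        // 0 out as an i64 and converts it back to the MMX value type the
        // function body expects.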
1663 if (RegVT.isVector()) { 1664 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1665 ArgValue, DAG.getConstant(0, MVT::i64)); 1666 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1667 } else 1668 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1669 } 1670 } else { 1671 assert(VA.isMemLoc()); 1672 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1673 } 1674 1675 // If value is passed via pointer - do a load. 1676 if (VA.getLocInfo() == CCValAssign::Indirect) 1677 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, 1678 false, false, 0); 1679 1680 InVals.push_back(ArgValue); 1681 } 1682 1683 // The x86-64 ABI for returning structs by value requires that we copy 1684 // the sret argument into %rax for the return. Save the argument into 1685 // a virtual register so that we can access it from the return points. 1686 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1687 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1688 unsigned Reg = FuncInfo->getSRetReturnReg(); 1689 if (!Reg) { 1690 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1691 FuncInfo->setSRetReturnReg(Reg); 1692 } 1693 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1694 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1695 } 1696 1697 unsigned StackSize = CCInfo.getNextStackOffset(); 1698 // Align stack specially for tail calls. 1699 if (FuncIsMadeTailCallSafe(CallConv)) 1700 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1701 1702 // If the function takes variable number of arguments, make a frame index for 1703 // the start of the first vararg value... for expansion of llvm.va_start. 
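  // For orientation (assuming the non-Windows x86-64 ABI handled below): the
  // register save area is TotalNumIntRegs*8 + TotalNumXMMRegs*16 =
  // 6*8 + 8*16 = 176 bytes, and va_start is handed
  //   gp_offset = NumIntRegs * 8        (next unused GPR slot, 0..48)
  //   fp_offset = 48 + NumXMMRegs * 16  (next unused XMM slot, 48..176)
  // so va_arg can read the remaining register-passed arguments out of this
  // block before falling back to the overflow area on the stack.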
1704 if (isVarArg) { 1705 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1706 CallConv != CallingConv::X86_ThisCall)) { 1707 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1708 } 1709 if (Is64Bit) { 1710 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1711 1712 // FIXME: We should really autogenerate these arrays 1713 static const unsigned GPR64ArgRegsWin64[] = { 1714 X86::RCX, X86::RDX, X86::R8, X86::R9 1715 }; 1716 static const unsigned XMMArgRegsWin64[] = { 1717 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1718 }; 1719 static const unsigned GPR64ArgRegs64Bit[] = { 1720 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1721 }; 1722 static const unsigned XMMArgRegs64Bit[] = { 1723 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1724 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1725 }; 1726 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1727 1728 if (IsWin64) { 1729 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1730 GPR64ArgRegs = GPR64ArgRegsWin64; 1731 XMMArgRegs = XMMArgRegsWin64; 1732 } else { 1733 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1734 GPR64ArgRegs = GPR64ArgRegs64Bit; 1735 XMMArgRegs = XMMArgRegs64Bit; 1736 } 1737 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1738 TotalNumIntRegs); 1739 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1740 TotalNumXMMRegs); 1741 1742 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1743 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1744 "SSE register cannot be used when SSE is disabled!"); 1745 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1746 "SSE register cannot be used when SSE is disabled!"); 1747 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1748 // Kernel mode asks for SSE to be disabled, so don't push them 1749 // on the stack. 1750 TotalNumXMMRegs = 0; 1751 1752 // For X86-64, if there are vararg parameters that are passed via 1753 // registers, then we must store them to their spots on the stack so they 1754 // may be loaded by deferencing the result of va_next. 1755 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1756 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1757 FuncInfo->setRegSaveFrameIndex( 1758 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1759 false)); 1760 1761 // Store the integer parameter registers. 1762 SmallVector<SDValue, 8> MemOps; 1763 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1764 getPointerTy()); 1765 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1766 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1767 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1768 DAG.getIntPtrConstant(Offset)); 1769 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1770 X86::GR64RegisterClass); 1771 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1772 SDValue Store = 1773 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1774 PseudoSourceValue::getFixedStack( 1775 FuncInfo->getRegSaveFrameIndex()), 1776 Offset, false, false, 0); 1777 MemOps.push_back(Store); 1778 Offset += 8; 1779 } 1780 1781 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1782 // Now store the XMM (fp + vector) parameter registers. 
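        // Informal sketch of what the pseudo built below does (the exact
        // expansion lives elsewhere): VASTART_SAVE_XMM_REGS takes the %al
        // value as an operand and skips the XMM spills when it is zero,
        // matching the ABI rule that %al bounds the number of vector
        // registers a varargs call actually uses.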
1783 SmallVector<SDValue, 11> SaveXMMOps; 1784 SaveXMMOps.push_back(Chain); 1785 1786 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1787 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1788 SaveXMMOps.push_back(ALVal); 1789 1790 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1791 FuncInfo->getRegSaveFrameIndex())); 1792 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1793 FuncInfo->getVarArgsFPOffset())); 1794 1795 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1796 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1797 X86::VR128RegisterClass); 1798 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1799 SaveXMMOps.push_back(Val); 1800 } 1801 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1802 MVT::Other, 1803 &SaveXMMOps[0], SaveXMMOps.size())); 1804 } 1805 1806 if (!MemOps.empty()) 1807 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1808 &MemOps[0], MemOps.size()); 1809 } 1810 } 1811 1812 // Some CCs need callee pop. 1813 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1814 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1815 } else { 1816 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1817 // If this is an sret function, the return should pop the hidden pointer. 1818 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1819 FuncInfo->setBytesToPopOnReturn(4); 1820 } 1821 1822 if (!Is64Bit) { 1823 // RegSaveFrameIndex is X86-64 only. 1824 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1825 if (CallConv == CallingConv::X86_FastCall || 1826 CallConv == CallingConv::X86_ThisCall) 1827 // fastcc functions can't have varargs. 1828 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1829 } 1830 1831 return Chain; 1832} 1833 1834SDValue 1835X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1836 SDValue StackPtr, SDValue Arg, 1837 DebugLoc dl, SelectionDAG &DAG, 1838 const CCValAssign &VA, 1839 ISD::ArgFlagsTy Flags) const { 1840 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1841 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1842 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1843 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1844 if (Flags.isByVal()) { 1845 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1846 } 1847 return DAG.getStore(Chain, dl, Arg, PtrOff, 1848 PseudoSourceValue::getStack(), LocMemOffset, 1849 false, false, 0); 1850} 1851 1852/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1853/// optimization is performed and it is required. 1854SDValue 1855X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1856 SDValue &OutRetAddr, SDValue Chain, 1857 bool IsTailCall, bool Is64Bit, 1858 int FPDiff, DebugLoc dl) const { 1859 // Adjust the Return address stack slot. 1860 EVT VT = getPointerTy(); 1861 OutRetAddr = getReturnAddressFrameIndex(DAG); 1862 1863 // Load the "old" Return address. 1864 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0); 1865 return SDValue(OutRetAddr.getNode(), 1); 1866} 1867 1868/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1869/// optimization is performed and it is required (FPDiff!=0). 1870static SDValue 1871EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1872 SDValue Chain, SDValue RetAddrFrIdx, 1873 bool Is64Bit, int FPDiff, DebugLoc dl) { 1874 // Store the return address to the appropriate stack slot. 
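  // Worked example (hypothetical sizes): if this function's own argument
  // area is 8 bytes and the tail-called callee needs 24, FPDiff = 8 - 24 =
  // -16. On x86-64 (SlotSize == 8) the return address normally sits at
  // fixed offset -8; below it is re-stored at FPDiff - SlotSize = -24,
  // i.e. moved 16 bytes down to make room for the larger outgoing
  // argument area.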
1875 if (!FPDiff) return Chain; 1876 // Calculate the new stack slot for the return address. 1877 int SlotSize = Is64Bit ? 8 : 4; 1878 int NewReturnAddrFI = 1879 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1880 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1881 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1882 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1883 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0, 1884 false, false, 0); 1885 return Chain; 1886} 1887 1888SDValue 1889X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1890 CallingConv::ID CallConv, bool isVarArg, 1891 bool &isTailCall, 1892 const SmallVectorImpl<ISD::OutputArg> &Outs, 1893 const SmallVectorImpl<SDValue> &OutVals, 1894 const SmallVectorImpl<ISD::InputArg> &Ins, 1895 DebugLoc dl, SelectionDAG &DAG, 1896 SmallVectorImpl<SDValue> &InVals) const { 1897 MachineFunction &MF = DAG.getMachineFunction(); 1898 bool Is64Bit = Subtarget->is64Bit(); 1899 bool IsStructRet = CallIsStructReturn(Outs); 1900 bool IsSibcall = false; 1901 1902 if (isTailCall) { 1903 // Check if it's really possible to do a tail call. 1904 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1905 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1906 Outs, OutVals, Ins, DAG); 1907 1908 // Sibcalls are automatically detected tailcalls which do not require 1909 // ABI changes. 1910 if (!GuaranteedTailCallOpt && isTailCall) 1911 IsSibcall = true; 1912 1913 if (isTailCall) 1914 ++NumTailCalls; 1915 } 1916 1917 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1918 "Var args not supported with calling convention fastcc or ghc"); 1919 1920 // Analyze operands of the call, assigning locations to each operand. 1921 SmallVector<CCValAssign, 16> ArgLocs; 1922 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1923 ArgLocs, *DAG.getContext()); 1924 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1925 1926 // Get a count of how many bytes are to be pushed on the stack. 1927 unsigned NumBytes = CCInfo.getNextStackOffset(); 1928 if (IsSibcall) 1929 // This is a sibcall. The memory operands are available in caller's 1930 // own caller's stack. 1931 NumBytes = 0; 1932 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1933 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1934 1935 int FPDiff = 0; 1936 if (isTailCall && !IsSibcall) { 1937 // Lower arguments at fp - stackoffset + fpdiff. 1938 unsigned NumBytesCallerPushed = 1939 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1940 FPDiff = NumBytesCallerPushed - NumBytes; 1941 1942 // Set the delta of movement of the returnaddr stackslot. 1943 // But only set if delta is greater than previous delta. 1944 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1945 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1946 } 1947 1948 if (!IsSibcall) 1949 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1950 1951 SDValue RetAddrFrIdx; 1952 // Load return adress for tail calls. 1953 if (isTailCall && FPDiff) 1954 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 1955 Is64Bit, FPDiff, dl); 1956 1957 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1958 SmallVector<SDValue, 8> MemOpChains; 1959 SDValue StackPtr; 1960 1961 // Walk the register/memloc assignments, inserting copies/loads. In the case 1962 // of tail call optimization arguments are handle later. 
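  // For illustration (hypothetical arguments): an i8 assigned to a 32-bit
  // register location shows up below as SExt/ZExt/AExt and receives the
  // matching extend; a 64-bit MMX vector assigned to an XMM register hits
  // the AExt special case, which bit-converts it to i64 and shuffles it into
  // the low lane of a v2i64 before the copy to the register.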
1963 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1964 CCValAssign &VA = ArgLocs[i]; 1965 EVT RegVT = VA.getLocVT(); 1966 SDValue Arg = OutVals[i]; 1967 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1968 bool isByVal = Flags.isByVal(); 1969 1970 // Promote the value if needed. 1971 switch (VA.getLocInfo()) { 1972 default: llvm_unreachable("Unknown loc info!"); 1973 case CCValAssign::Full: break; 1974 case CCValAssign::SExt: 1975 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1976 break; 1977 case CCValAssign::ZExt: 1978 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1979 break; 1980 case CCValAssign::AExt: 1981 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1982 // Special case: passing MMX values in XMM registers. 1983 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1984 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1985 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1986 } else 1987 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1988 break; 1989 case CCValAssign::BCvt: 1990 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1991 break; 1992 case CCValAssign::Indirect: { 1993 // Store the argument. 1994 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1995 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1996 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1997 PseudoSourceValue::getFixedStack(FI), 0, 1998 false, false, 0); 1999 Arg = SpillSlot; 2000 break; 2001 } 2002 } 2003 2004 if (VA.isRegLoc()) { 2005 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2006 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2007 assert(VA.isMemLoc()); 2008 if (StackPtr.getNode() == 0) 2009 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2010 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2011 dl, DAG, VA, Flags)); 2012 } 2013 } 2014 2015 if (!MemOpChains.empty()) 2016 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2017 &MemOpChains[0], MemOpChains.size()); 2018 2019 // Build a sequence of copy-to-reg nodes chained together with token chain 2020 // and flag operands which copy the outgoing args into registers. 2021 SDValue InFlag; 2022 // Tail call byval lowering might overwrite argument registers so in case of 2023 // tail call optimization the copies to registers are lowered later. 2024 if (!isTailCall) 2025 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2026 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2027 RegsToPass[i].second, InFlag); 2028 InFlag = Chain.getValue(1); 2029 } 2030 2031 if (Subtarget->isPICStyleGOT()) { 2032 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2033 // GOT pointer. 2034 if (!isTailCall) { 2035 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2036 DAG.getNode(X86ISD::GlobalBaseReg, 2037 DebugLoc(), getPointerTy()), 2038 InFlag); 2039 InFlag = Chain.getValue(1); 2040 } else { 2041 // If we are tail calling and generating PIC/GOT style code load the 2042 // address of the callee into ECX. The value in ecx is used as target of 2043 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2044 // for tail calls on PIC/GOT architectures. Normally we would just put the 2045 // address of GOT into ebx and then call target@PLT. But for tail calls 2046 // ebx would be restored (since ebx is callee saved) before jumping to the 2047 // target@PLT. 2048 2049 // Note: The actual moving to ECX is done further down. 
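      // Informally: callees with hidden or protected visibility bind within
      // this module, so a direct pc-relative call reaches them without the
      // GOT; only default-visibility globals and external symbols need their
      // address materialized here via LowerGlobalAddress /
      // LowerExternalSymbol.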
2050 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2051 if (G && !G->getGlobal()->hasHiddenVisibility() && 2052 !G->getGlobal()->hasProtectedVisibility()) 2053 Callee = LowerGlobalAddress(Callee, DAG); 2054 else if (isa<ExternalSymbolSDNode>(Callee)) 2055 Callee = LowerExternalSymbol(Callee, DAG); 2056 } 2057 } 2058 2059 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) { 2060 // From AMD64 ABI document: 2061 // For calls that may call functions that use varargs or stdargs 2062 // (prototype-less calls or calls to functions containing ellipsis (...) in 2063 // the declaration) %al is used as hidden argument to specify the number 2064 // of SSE registers used. The contents of %al do not need to match exactly 2065 // the number of registers, but must be an ubound on the number of SSE 2066 // registers used and is in the range 0 - 8 inclusive. 2067 2068 // Count the number of XMM registers allocated. 2069 static const unsigned XMMArgRegs[] = { 2070 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2071 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2072 }; 2073 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2074 assert((Subtarget->hasSSE1() || !NumXMMRegs) 2075 && "SSE registers cannot be used when SSE is disabled"); 2076 2077 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2078 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2079 InFlag = Chain.getValue(1); 2080 } 2081 2082 2083 // For tail calls lower the arguments to the 'real' stack slot. 2084 if (isTailCall) { 2085 // Force all the incoming stack arguments to be loaded from the stack 2086 // before any new outgoing arguments are stored to the stack, because the 2087 // outgoing stack slots may alias the incoming argument stack slots, and 2088 // the alias isn't otherwise explicit. This is slightly more conservative 2089 // than necessary, because it means that each store effectively depends 2090 // on every argument instead of just those arguments it would clobber. 2091 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2092 2093 SmallVector<SDValue, 8> MemOpChains2; 2094 SDValue FIN; 2095 int FI = 0; 2096 // Do not flag preceeding copytoreg stuff together with the following stuff. 2097 InFlag = SDValue(); 2098 if (GuaranteedTailCallOpt) { 2099 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2100 CCValAssign &VA = ArgLocs[i]; 2101 if (VA.isRegLoc()) 2102 continue; 2103 assert(VA.isMemLoc()); 2104 SDValue Arg = OutVals[i]; 2105 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2106 // Create frame index. 2107 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2108 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2109 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2110 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2111 2112 if (Flags.isByVal()) { 2113 // Copy relative to framepointer. 2114 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2115 if (StackPtr.getNode() == 0) 2116 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2117 getPointerTy()); 2118 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2119 2120 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2121 ArgChain, 2122 Flags, DAG, dl)); 2123 } else { 2124 // Store relative to framepointer. 
2125 MemOpChains2.push_back( 2126 DAG.getStore(ArgChain, dl, Arg, FIN, 2127 PseudoSourceValue::getFixedStack(FI), 0, 2128 false, false, 0)); 2129 } 2130 } 2131 } 2132 2133 if (!MemOpChains2.empty()) 2134 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2135 &MemOpChains2[0], MemOpChains2.size()); 2136 2137 // Copy arguments to their registers. 2138 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2139 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2140 RegsToPass[i].second, InFlag); 2141 InFlag = Chain.getValue(1); 2142 } 2143 InFlag =SDValue(); 2144 2145 // Store the return address to the appropriate stack slot. 2146 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2147 FPDiff, dl); 2148 } 2149 2150 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2151 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2152 // In the 64-bit large code model, we have to make all calls 2153 // through a register, since the call instruction's 32-bit 2154 // pc-relative offset may not be large enough to hold the whole 2155 // address. 2156 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2157 // If the callee is a GlobalAddress node (quite common, every direct call 2158 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2159 // it. 2160 2161 // We should use extra load for direct calls to dllimported functions in 2162 // non-JIT mode. 2163 const GlobalValue *GV = G->getGlobal(); 2164 if (!GV->hasDLLImportLinkage()) { 2165 unsigned char OpFlags = 0; 2166 2167 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2168 // external symbols most go through the PLT in PIC mode. If the symbol 2169 // has hidden or protected visibility, or if it is static or local, then 2170 // we don't need to use the PLT - we can directly call it. 2171 if (Subtarget->isTargetELF() && 2172 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2173 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2174 OpFlags = X86II::MO_PLT; 2175 } else if (Subtarget->isPICStyleStubAny() && 2176 (GV->isDeclaration() || GV->isWeakForLinker()) && 2177 Subtarget->getDarwinVers() < 9) { 2178 // PC-relative references to external symbols should go through $stub, 2179 // unless we're building with the leopard linker or later, which 2180 // automatically synthesizes these stubs. 2181 OpFlags = X86II::MO_DARWIN_STUB; 2182 } 2183 2184 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2185 G->getOffset(), OpFlags); 2186 } 2187 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2188 unsigned char OpFlags = 0; 2189 2190 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external 2191 // symbols should go through the PLT. 2192 if (Subtarget->isTargetELF() && 2193 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2194 OpFlags = X86II::MO_PLT; 2195 } else if (Subtarget->isPICStyleStubAny() && 2196 Subtarget->getDarwinVers() < 9) { 2197 // PC-relative references to external symbols should go through $stub, 2198 // unless we're building with the leopard linker or later, which 2199 // automatically synthesizes these stubs. 2200 OpFlags = X86II::MO_DARWIN_STUB; 2201 } 2202 2203 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2204 OpFlags); 2205 } 2206 2207 // Returns a chain & a flag for retval copy to use. 
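  // For orientation (informal summary of the operand list assembled below):
  //   CALL / TC_RETURN(Chain, Callee [, FPDiff for tail calls],
  //                    arg regs..., [EBX for PIC/GOT], [AL for varargs],
  //                    InFlag)
  // The register operands exist only to mark those registers live into the
  // call.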
2208 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 2209 SmallVector<SDValue, 8> Ops; 2210 2211 if (!IsSibcall && isTailCall) { 2212 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2213 DAG.getIntPtrConstant(0, true), InFlag); 2214 InFlag = Chain.getValue(1); 2215 } 2216 2217 Ops.push_back(Chain); 2218 Ops.push_back(Callee); 2219 2220 if (isTailCall) 2221 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2222 2223 // Add argument registers to the end of the list so that they are known live 2224 // into the call. 2225 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2226 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2227 RegsToPass[i].second.getValueType())); 2228 2229 // Add an implicit use GOT pointer in EBX. 2230 if (!isTailCall && Subtarget->isPICStyleGOT()) 2231 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2232 2233 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 2234 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) 2235 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2236 2237 if (InFlag.getNode()) 2238 Ops.push_back(InFlag); 2239 2240 if (isTailCall) { 2241 // We used to do: 2242 //// If this is the first return lowered for this function, add the regs 2243 //// to the liveout set for the function. 2244 // This isn't right, although it's probably harmless on x86; liveouts 2245 // should be computed from returns not tail calls. Consider a void 2246 // function making a tail call to a function returning int. 2247 return DAG.getNode(X86ISD::TC_RETURN, dl, 2248 NodeTys, &Ops[0], Ops.size()); 2249 } 2250 2251 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2252 InFlag = Chain.getValue(1); 2253 2254 // Create the CALLSEQ_END node. 2255 unsigned NumBytesForCalleeToPush; 2256 if (Subtarget->IsCalleePop(isVarArg, CallConv)) 2257 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2258 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2259 // If this is a call to a struct-return function, the callee 2260 // pops the hidden struct pointer, so we have to push it back. 2261 // This is common for Darwin/X86, Linux & Mingw32 targets. 2262 NumBytesForCalleeToPush = 4; 2263 else 2264 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2265 2266 // Returns a flag for retval copy to use. 2267 if (!IsSibcall) { 2268 Chain = DAG.getCALLSEQ_END(Chain, 2269 DAG.getIntPtrConstant(NumBytes, true), 2270 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2271 true), 2272 InFlag); 2273 InFlag = Chain.getValue(1); 2274 } 2275 2276 // Handle result values, copying them out of physregs into vregs that we 2277 // return. 2278 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2279 Ins, dl, DAG, InVals); 2280} 2281 2282 2283//===----------------------------------------------------------------------===// 2284// Fast Calling Convention (tail call) implementation 2285//===----------------------------------------------------------------------===// 2286 2287// Like std call, callee cleans arguments, convention except that ECX is 2288// reserved for storing the tail called function address. Only 2 registers are 2289// free for argument passing (inreg). Tail call optimization is performed 2290// provided: 2291// * tailcallopt is enabled 2292// * caller/callee are fastcc 2293// On X86_64 architecture with GOT-style position independent code only local 2294// (within module) calls are supported at the moment. 
2295// To keep the stack aligned according to platform abi the function 2296// GetAlignedArgumentStackSize ensures that argument delta is always multiples 2297// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) 2298// If a tail called function callee has more arguments than the caller the 2299// caller needs to make sure that there is room to move the RETADDR to. This is 2300// achieved by reserving an area the size of the argument delta right after the 2301// original REtADDR, but before the saved framepointer or the spilled registers 2302// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) 2303// stack layout: 2304// arg1 2305// arg2 2306// RETADDR 2307// [ new RETADDR 2308// move area ] 2309// (possible EBP) 2310// ESI 2311// EDI 2312// local1 .. 2313 2314/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 2315/// for a 16 byte align requirement. 2316unsigned 2317X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2318 SelectionDAG& DAG) const { 2319 MachineFunction &MF = DAG.getMachineFunction(); 2320 const TargetMachine &TM = MF.getTarget(); 2321 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 2322 unsigned StackAlignment = TFI.getStackAlignment(); 2323 uint64_t AlignMask = StackAlignment - 1; 2324 int64_t Offset = StackSize; 2325 uint64_t SlotSize = TD->getPointerSize(); 2326 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2327 // Number smaller than 12 so just add the difference. 2328 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2329 } else { 2330 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2331 Offset = ((~AlignMask) & Offset) + StackAlignment + 2332 (StackAlignment-SlotSize); 2333 } 2334 return Offset; 2335} 2336 2337/// MatchingStackOffset - Return true if the given stack call argument is 2338/// already available in the same position (relatively) of the caller's 2339/// incoming argument stack. 2340static 2341bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2342 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2343 const X86InstrInfo *TII) { 2344 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2345 int FI = INT_MAX; 2346 if (Arg.getOpcode() == ISD::CopyFromReg) { 2347 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2348 if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) 2349 return false; 2350 MachineInstr *Def = MRI->getVRegDef(VR); 2351 if (!Def) 2352 return false; 2353 if (!Flags.isByVal()) { 2354 if (!TII->isLoadFromStackSlot(Def, FI)) 2355 return false; 2356 } else { 2357 unsigned Opcode = Def->getOpcode(); 2358 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2359 Def->getOperand(1).isFI()) { 2360 FI = Def->getOperand(1).getIndex(); 2361 Bytes = Flags.getByValSize(); 2362 } else 2363 return false; 2364 } 2365 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2366 if (Flags.isByVal()) 2367 // ByVal argument is passed in as a pointer but it's now being 2368 // dereferenced. e.g. 
2369 // define @foo(%struct.X* %A) { 2370 // tail call @bar(%struct.X* byval %A) 2371 // } 2372 return false; 2373 SDValue Ptr = Ld->getBasePtr(); 2374 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2375 if (!FINode) 2376 return false; 2377 FI = FINode->getIndex(); 2378 } else 2379 return false; 2380 2381 assert(FI != INT_MAX); 2382 if (!MFI->isFixedObjectIndex(FI)) 2383 return false; 2384 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2385} 2386 2387/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2388/// for tail call optimization. Targets which want to do tail call 2389/// optimization should implement this function. 2390bool 2391X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2392 CallingConv::ID CalleeCC, 2393 bool isVarArg, 2394 bool isCalleeStructRet, 2395 bool isCallerStructRet, 2396 const SmallVectorImpl<ISD::OutputArg> &Outs, 2397 const SmallVectorImpl<SDValue> &OutVals, 2398 const SmallVectorImpl<ISD::InputArg> &Ins, 2399 SelectionDAG& DAG) const { 2400 if (!IsTailCallConvention(CalleeCC) && 2401 CalleeCC != CallingConv::C) 2402 return false; 2403 2404 // If -tailcallopt is specified, make fastcc functions tail-callable. 2405 const MachineFunction &MF = DAG.getMachineFunction(); 2406 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2407 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2408 bool CCMatch = CallerCC == CalleeCC; 2409 2410 if (GuaranteedTailCallOpt) { 2411 if (IsTailCallConvention(CalleeCC) && CCMatch) 2412 return true; 2413 return false; 2414 } 2415 2416 // Look for obvious safe cases to perform tail call optimization that do not 2417 // require ABI changes. This is what gcc calls sibcall. 2418 2419 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2420 // emit a special epilogue. 2421 if (RegInfo->needsStackRealignment(MF)) 2422 return false; 2423 2424 // Do not sibcall optimize vararg calls unless the call site is not passing 2425 // any arguments. 2426 if (isVarArg && !Outs.empty()) 2427 return false; 2428 2429 // Also avoid sibcall optimization if either caller or callee uses struct 2430 // return semantics. 2431 if (isCalleeStructRet || isCallerStructRet) 2432 return false; 2433 2434 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2435 // Therefore if it's not used by the call it is not safe to optimize this into 2436 // a sibcall. 2437 bool Unused = false; 2438 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2439 if (!Ins[i].Used) { 2440 Unused = true; 2441 break; 2442 } 2443 } 2444 if (Unused) { 2445 SmallVector<CCValAssign, 16> RVLocs; 2446 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2447 RVLocs, *DAG.getContext()); 2448 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2449 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2450 CCValAssign &VA = RVLocs[i]; 2451 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2452 return false; 2453 } 2454 } 2455 2456 // If the calling conventions do not match, then we'd better make sure the 2457 // results are returned in the same way as what the caller expects. 
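  // For illustration (hypothetical mismatch): if the callee's convention
  // returns a value in a register the caller's convention does not expect it
  // in, turning the call into a sibcall would leave the caller reading the
  // wrong location; the checks below therefore require the two
  // AnalyzeCallResult layouts to agree location by location.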
2458 if (!CCMatch) { 2459 SmallVector<CCValAssign, 16> RVLocs1; 2460 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2461 RVLocs1, *DAG.getContext()); 2462 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2463 2464 SmallVector<CCValAssign, 16> RVLocs2; 2465 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2466 RVLocs2, *DAG.getContext()); 2467 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2468 2469 if (RVLocs1.size() != RVLocs2.size()) 2470 return false; 2471 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2472 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2473 return false; 2474 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2475 return false; 2476 if (RVLocs1[i].isRegLoc()) { 2477 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2478 return false; 2479 } else { 2480 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2481 return false; 2482 } 2483 } 2484 } 2485 2486 // If the callee takes no arguments then go on to check the results of the 2487 // call. 2488 if (!Outs.empty()) { 2489 // Check if stack adjustment is needed. For now, do not do this if any 2490 // argument is passed on the stack. 2491 SmallVector<CCValAssign, 16> ArgLocs; 2492 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2493 ArgLocs, *DAG.getContext()); 2494 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2495 if (CCInfo.getNextStackOffset()) { 2496 MachineFunction &MF = DAG.getMachineFunction(); 2497 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2498 return false; 2499 if (Subtarget->isTargetWin64()) 2500 // Win64 ABI has additional complications. 2501 return false; 2502 2503 // Check if the arguments are already laid out in the right way as 2504 // the caller's fixed stack objects. 2505 MachineFrameInfo *MFI = MF.getFrameInfo(); 2506 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2507 const X86InstrInfo *TII = 2508 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2509 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2510 CCValAssign &VA = ArgLocs[i]; 2511 SDValue Arg = OutVals[i]; 2512 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2513 if (VA.getLocInfo() == CCValAssign::Indirect) 2514 return false; 2515 if (!VA.isRegLoc()) { 2516 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2517 MFI, MRI, TII)) 2518 return false; 2519 } 2520 } 2521 } 2522 2523 // If the tailcall address may be in a register, then make sure it's 2524 // possible to register allocate for it. In 32-bit, the call address can 2525 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2526 // callee-saved registers are restored. These happen to be the same 2527 // registers used to pass 'inreg' arguments so watch out for those. 
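  // For illustration (hypothetical 32-bit case): an indirect tail call whose
  // callee takes two 'inreg' i32 arguments uses two of EAX/ECX/EDX, leaving
  // only one register for the call target; a third register argument uses
  // them all, which is what the NumInRegs == 3 check below rejects.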
2528 if (!Subtarget->is64Bit() && 2529 !isa<GlobalAddressSDNode>(Callee) && 2530 !isa<ExternalSymbolSDNode>(Callee)) { 2531 unsigned NumInRegs = 0; 2532 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2533 CCValAssign &VA = ArgLocs[i]; 2534 if (!VA.isRegLoc()) 2535 continue; 2536 unsigned Reg = VA.getLocReg(); 2537 switch (Reg) { 2538 default: break; 2539 case X86::EAX: case X86::EDX: case X86::ECX: 2540 if (++NumInRegs == 3) 2541 return false; 2542 break; 2543 } 2544 } 2545 } 2546 } 2547 2548 return true; 2549} 2550 2551FastISel * 2552X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2553 return X86::createFastISel(funcInfo); 2554} 2555 2556 2557//===----------------------------------------------------------------------===// 2558// Other Lowering Hooks 2559//===----------------------------------------------------------------------===// 2560 2561static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2562 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2563 2564 switch(Opc) { 2565 default: llvm_unreachable("Unknown x86 shuffle node"); 2566 case X86ISD::PSHUFHW: 2567 case X86ISD::PSHUFLW: 2568 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2569 } 2570 2571 return SDValue(); 2572} 2573 2574SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2575 MachineFunction &MF = DAG.getMachineFunction(); 2576 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2577 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2578 2579 if (ReturnAddrIndex == 0) { 2580 // Set up a frame object for the return address. 2581 uint64_t SlotSize = TD->getPointerSize(); 2582 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2583 false); 2584 FuncInfo->setRAIndex(ReturnAddrIndex); 2585 } 2586 2587 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2588} 2589 2590 2591bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2592 bool hasSymbolicDisplacement) { 2593 // Offset should fit into 32 bit immediate field. 2594 if (!isInt<32>(Offset)) 2595 return false; 2596 2597 // If we don't have a symbolic displacement - we don't have any extra 2598 // restrictions. 2599 if (!hasSymbolicDisplacement) 2600 return true; 2601 2602 // FIXME: Some tweaks might be needed for medium code model. 2603 if (M != CodeModel::Small && M != CodeModel::Kernel) 2604 return false; 2605 2606 // For small code model we assume that latest object is 16MB before end of 31 2607 // bits boundary. We may also accept pretty large negative constants knowing 2608 // that all objects are in the positive half of address space. 2609 if (M == CodeModel::Small && Offset < 16*1024*1024) 2610 return true; 2611 2612 // For kernel code model we know that all object resist in the negative half 2613 // of 32bits address space. We may not accept negative offsets, since they may 2614 // be just off and we may accept pretty large positive ones. 2615 if (M == CodeModel::Kernel && Offset > 0) 2616 return true; 2617 2618 return false; 2619} 2620 2621/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2622/// specific condition code, returning the condition code and the LHS/RHS of the 2623/// comparison to make. 
2624static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2625 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2626 if (!isFP) { 2627 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2628 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2629 // X > -1 -> X == 0, jump !sign. 2630 RHS = DAG.getConstant(0, RHS.getValueType()); 2631 return X86::COND_NS; 2632 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2633 // X < 0 -> X == 0, jump on sign. 2634 return X86::COND_S; 2635 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2636 // X < 1 -> X <= 0 2637 RHS = DAG.getConstant(0, RHS.getValueType()); 2638 return X86::COND_LE; 2639 } 2640 } 2641 2642 switch (SetCCOpcode) { 2643 default: llvm_unreachable("Invalid integer condition!"); 2644 case ISD::SETEQ: return X86::COND_E; 2645 case ISD::SETGT: return X86::COND_G; 2646 case ISD::SETGE: return X86::COND_GE; 2647 case ISD::SETLT: return X86::COND_L; 2648 case ISD::SETLE: return X86::COND_LE; 2649 case ISD::SETNE: return X86::COND_NE; 2650 case ISD::SETULT: return X86::COND_B; 2651 case ISD::SETUGT: return X86::COND_A; 2652 case ISD::SETULE: return X86::COND_BE; 2653 case ISD::SETUGE: return X86::COND_AE; 2654 } 2655 } 2656 2657 // First determine if it is required or is profitable to flip the operands. 2658 2659 // If LHS is a foldable load, but RHS is not, flip the condition. 2660 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2661 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2662 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2663 std::swap(LHS, RHS); 2664 } 2665 2666 switch (SetCCOpcode) { 2667 default: break; 2668 case ISD::SETOLT: 2669 case ISD::SETOLE: 2670 case ISD::SETUGT: 2671 case ISD::SETUGE: 2672 std::swap(LHS, RHS); 2673 break; 2674 } 2675 2676 // On a floating point condition, the flags are set as follows: 2677 // ZF PF CF op 2678 // 0 | 0 | 0 | X > Y 2679 // 0 | 0 | 1 | X < Y 2680 // 1 | 0 | 0 | X == Y 2681 // 1 | 1 | 1 | unordered 2682 switch (SetCCOpcode) { 2683 default: llvm_unreachable("Condcode should be pre-legalized away"); 2684 case ISD::SETUEQ: 2685 case ISD::SETEQ: return X86::COND_E; 2686 case ISD::SETOLT: // flipped 2687 case ISD::SETOGT: 2688 case ISD::SETGT: return X86::COND_A; 2689 case ISD::SETOLE: // flipped 2690 case ISD::SETOGE: 2691 case ISD::SETGE: return X86::COND_AE; 2692 case ISD::SETUGT: // flipped 2693 case ISD::SETULT: 2694 case ISD::SETLT: return X86::COND_B; 2695 case ISD::SETUGE: // flipped 2696 case ISD::SETULE: 2697 case ISD::SETLE: return X86::COND_BE; 2698 case ISD::SETONE: 2699 case ISD::SETNE: return X86::COND_NE; 2700 case ISD::SETUO: return X86::COND_P; 2701 case ISD::SETO: return X86::COND_NP; 2702 case ISD::SETOEQ: 2703 case ISD::SETUNE: return X86::COND_INVALID; 2704 } 2705} 2706 2707/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2708/// code. Current x86 isa includes the following FP cmov instructions: 2709/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2710static bool hasFPCMov(unsigned X86CC) { 2711 switch (X86CC) { 2712 default: 2713 return false; 2714 case X86::COND_B: 2715 case X86::COND_BE: 2716 case X86::COND_E: 2717 case X86::COND_P: 2718 case X86::COND_A: 2719 case X86::COND_AE: 2720 case X86::COND_NE: 2721 case X86::COND_NP: 2722 return true; 2723 } 2724} 2725 2726/// isFPImmLegal - Returns true if the target can instruction select the 2727/// specified FP immediate natively. 
If false, the legalizer will 2728/// materialize the FP immediate as a load from a constant pool. 2729bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 2730 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 2731 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 2732 return true; 2733 } 2734 return false; 2735} 2736 2737/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2738/// the specified range (L, H]. 2739static bool isUndefOrInRange(int Val, int Low, int Hi) { 2740 return (Val < 0) || (Val >= Low && Val < Hi); 2741} 2742 2743/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2744/// specified value. 2745static bool isUndefOrEqual(int Val, int CmpVal) { 2746 if (Val < 0 || Val == CmpVal) 2747 return true; 2748 return false; 2749} 2750 2751/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2752/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2753/// the second operand. 2754static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2755 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2756 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2757 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2758 return (Mask[0] < 2 && Mask[1] < 2); 2759 return false; 2760} 2761 2762bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2763 SmallVector<int, 8> M; 2764 N->getMask(M); 2765 return ::isPSHUFDMask(M, N->getValueType(0)); 2766} 2767 2768/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2769/// is suitable for input to PSHUFHW. 2770static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2771 if (VT != MVT::v8i16) 2772 return false; 2773 2774 // Lower quadword copied in order or undef. 2775 for (int i = 0; i != 4; ++i) 2776 if (Mask[i] >= 0 && Mask[i] != i) 2777 return false; 2778 2779 // Upper quadword shuffled. 2780 for (int i = 4; i != 8; ++i) 2781 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2782 return false; 2783 2784 return true; 2785} 2786 2787bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2788 SmallVector<int, 8> M; 2789 N->getMask(M); 2790 return ::isPSHUFHWMask(M, N->getValueType(0)); 2791} 2792 2793/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2794/// is suitable for input to PSHUFLW. 2795static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2796 if (VT != MVT::v8i16) 2797 return false; 2798 2799 // Upper quadword copied in order. 2800 for (int i = 4; i != 8; ++i) 2801 if (Mask[i] >= 0 && Mask[i] != i) 2802 return false; 2803 2804 // Lower quadword shuffled. 2805 for (int i = 0; i != 4; ++i) 2806 if (Mask[i] >= 4) 2807 return false; 2808 2809 return true; 2810} 2811 2812bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2813 SmallVector<int, 8> M; 2814 N->getMask(M); 2815 return ::isPSHUFLWMask(M, N->getValueType(0)); 2816} 2817 2818/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2819/// is suitable for input to PALIGNR. 2820static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2821 bool hasSSSE3) { 2822 int i, e = VT.getVectorNumElements(); 2823 2824 // Do not handle v2i64 / v2f64 shuffles with palignr. 2825 if (e < 4 || !hasSSSE3) 2826 return false; 2827 2828 for (i = 0; i != e; ++i) 2829 if (Mask[i] >= 0) 2830 break; 2831 2832 // All undef, not a palignr. 
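  // (That is: every element was undef, so there is nothing to align.)
  // For illustration (hypothetical v16i8 mask): <5, 6, ..., 19, 20> is
  // accepted below, since it reads 16 consecutive bytes starting at offset 5
  // of the two concatenated inputs -- exactly the window a PALIGNR with an
  // immediate of 5 selects.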
2833 if (i == e) 2834 return false; 2835 2836 // Determine if it's ok to perform a palignr with only the LHS, since we 2837 // don't have access to the actual shuffle elements to see if RHS is undef. 2838 bool Unary = Mask[i] < (int)e; 2839 bool NeedsUnary = false; 2840 2841 int s = Mask[i] - i; 2842 2843 // Check the rest of the elements to see if they are consecutive. 2844 for (++i; i != e; ++i) { 2845 int m = Mask[i]; 2846 if (m < 0) 2847 continue; 2848 2849 Unary = Unary && (m < (int)e); 2850 NeedsUnary = NeedsUnary || (m < s); 2851 2852 if (NeedsUnary && !Unary) 2853 return false; 2854 if (Unary && m != ((s+i) & (e-1))) 2855 return false; 2856 if (!Unary && m != (s+i)) 2857 return false; 2858 } 2859 return true; 2860} 2861 2862bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2863 SmallVector<int, 8> M; 2864 N->getMask(M); 2865 return ::isPALIGNRMask(M, N->getValueType(0), true); 2866} 2867 2868/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2869/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2870static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2871 int NumElems = VT.getVectorNumElements(); 2872 if (NumElems != 2 && NumElems != 4) 2873 return false; 2874 2875 int Half = NumElems / 2; 2876 for (int i = 0; i < Half; ++i) 2877 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2878 return false; 2879 for (int i = Half; i < NumElems; ++i) 2880 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2881 return false; 2882 2883 return true; 2884} 2885 2886bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2887 SmallVector<int, 8> M; 2888 N->getMask(M); 2889 return ::isSHUFPMask(M, N->getValueType(0)); 2890} 2891 2892/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2893/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2894/// half elements to come from vector 1 (which would equal the dest.) and 2895/// the upper half to come from vector 2. 2896static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2897 int NumElems = VT.getVectorNumElements(); 2898 2899 if (NumElems != 2 && NumElems != 4) 2900 return false; 2901 2902 int Half = NumElems / 2; 2903 for (int i = 0; i < Half; ++i) 2904 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2905 return false; 2906 for (int i = Half; i < NumElems; ++i) 2907 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2908 return false; 2909 return true; 2910} 2911 2912static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2913 SmallVector<int, 8> M; 2914 N->getMask(M); 2915 return isCommutedSHUFPMask(M, N->getValueType(0)); 2916} 2917 2918/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2919/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2920bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2921 if (N->getValueType(0).getVectorNumElements() != 4) 2922 return false; 2923 2924 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2925 return isUndefOrEqual(N->getMaskElt(0), 6) && 2926 isUndefOrEqual(N->getMaskElt(1), 7) && 2927 isUndefOrEqual(N->getMaskElt(2), 2) && 2928 isUndefOrEqual(N->getMaskElt(3), 3); 2929} 2930 2931/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2932/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. 
vector_shuffle v, undef, 2933/// <2, 3, 2, 3> 2934bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2935 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2936 2937 if (NumElems != 4) 2938 return false; 2939 2940 return isUndefOrEqual(N->getMaskElt(0), 2) && 2941 isUndefOrEqual(N->getMaskElt(1), 3) && 2942 isUndefOrEqual(N->getMaskElt(2), 2) && 2943 isUndefOrEqual(N->getMaskElt(3), 3); 2944} 2945 2946/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2947/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2948bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2949 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2950 2951 if (NumElems != 2 && NumElems != 4) 2952 return false; 2953 2954 for (unsigned i = 0; i < NumElems/2; ++i) 2955 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2956 return false; 2957 2958 for (unsigned i = NumElems/2; i < NumElems; ++i) 2959 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2960 return false; 2961 2962 return true; 2963} 2964 2965/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 2966/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 2967bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 2968 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2969 2970 if (NumElems != 2 && NumElems != 4) 2971 return false; 2972 2973 for (unsigned i = 0; i < NumElems/2; ++i) 2974 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2975 return false; 2976 2977 for (unsigned i = 0; i < NumElems/2; ++i) 2978 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2979 return false; 2980 2981 return true; 2982} 2983 2984/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2985/// specifies a shuffle of elements that is suitable for input to UNPCKL. 2986static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2987 bool V2IsSplat = false) { 2988 int NumElts = VT.getVectorNumElements(); 2989 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2990 return false; 2991 2992 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2993 int BitI = Mask[i]; 2994 int BitI1 = Mask[i+1]; 2995 if (!isUndefOrEqual(BitI, j)) 2996 return false; 2997 if (V2IsSplat) { 2998 if (!isUndefOrEqual(BitI1, NumElts)) 2999 return false; 3000 } else { 3001 if (!isUndefOrEqual(BitI1, j + NumElts)) 3002 return false; 3003 } 3004 } 3005 return true; 3006} 3007 3008bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3009 SmallVector<int, 8> M; 3010 N->getMask(M); 3011 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3012} 3013 3014/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3015/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
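/// For example, with v4i32 the canonical UNPCKH mask is <2, 6, 3, 7>
/// (elements 0-3 index the first operand, 4-7 the second), matching the
/// punpckhdq/unpckhps interleave of the two high halves; undef (-1) entries
/// are accepted in any position.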
3016static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3017 bool V2IsSplat = false) { 3018 int NumElts = VT.getVectorNumElements(); 3019 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3020 return false; 3021 3022 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3023 int BitI = Mask[i]; 3024 int BitI1 = Mask[i+1]; 3025 if (!isUndefOrEqual(BitI, j + NumElts/2)) 3026 return false; 3027 if (V2IsSplat) { 3028 if (isUndefOrEqual(BitI1, NumElts)) 3029 return false; 3030 } else { 3031 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 3032 return false; 3033 } 3034 } 3035 return true; 3036} 3037 3038bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3039 SmallVector<int, 8> M; 3040 N->getMask(M); 3041 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3042} 3043 3044/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3045/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3046/// <0, 0, 1, 1> 3047static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3048 int NumElems = VT.getVectorNumElements(); 3049 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3050 return false; 3051 3052 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 3053 int BitI = Mask[i]; 3054 int BitI1 = Mask[i+1]; 3055 if (!isUndefOrEqual(BitI, j)) 3056 return false; 3057 if (!isUndefOrEqual(BitI1, j)) 3058 return false; 3059 } 3060 return true; 3061} 3062 3063bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3064 SmallVector<int, 8> M; 3065 N->getMask(M); 3066 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3067} 3068 3069/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3070/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3071/// <2, 2, 3, 3> 3072static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3073 int NumElems = VT.getVectorNumElements(); 3074 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3075 return false; 3076 3077 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3078 int BitI = Mask[i]; 3079 int BitI1 = Mask[i+1]; 3080 if (!isUndefOrEqual(BitI, j)) 3081 return false; 3082 if (!isUndefOrEqual(BitI1, j)) 3083 return false; 3084 } 3085 return true; 3086} 3087 3088bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3089 SmallVector<int, 8> M; 3090 N->getMask(M); 3091 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3092} 3093 3094/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3095/// specifies a shuffle of elements that is suitable for input to MOVSS, 3096/// MOVSD, and MOVD, i.e. setting the lowest element. 3097static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3098 if (VT.getVectorElementType().getSizeInBits() < 32) 3099 return false; 3100 3101 int NumElts = VT.getVectorNumElements(); 3102 3103 if (!isUndefOrEqual(Mask[0], NumElts)) 3104 return false; 3105 3106 for (int i = 1; i < NumElts; ++i) 3107 if (!isUndefOrEqual(Mask[i], i)) 3108 return false; 3109 3110 return true; 3111} 3112 3113bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3114 SmallVector<int, 8> M; 3115 N->getMask(M); 3116 return ::isMOVLMask(M, N->getValueType(0)); 3117} 3118 3119/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 3120/// of what x86 movss want. X86 movs requires the lowest element to be lowest 3121/// element of vector 2 and the other elements to come from vector 1 in order. 
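/// For example, the commuted v4i32 form is <0, 5, 6, 7>; commuting the two
/// operands turns it into the <4, 1, 2, 3> pattern that movss/movsd/movd
/// actually match (see isMOVLMask above).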
3122static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3123 bool V2IsSplat = false, bool V2IsUndef = false) { 3124 int NumOps = VT.getVectorNumElements(); 3125 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3126 return false; 3127 3128 if (!isUndefOrEqual(Mask[0], 0)) 3129 return false; 3130 3131 for (int i = 1; i < NumOps; ++i) 3132 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3133 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3134 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3135 return false; 3136 3137 return true; 3138} 3139 3140static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3141 bool V2IsUndef = false) { 3142 SmallVector<int, 8> M; 3143 N->getMask(M); 3144 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3145} 3146 3147/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3148/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3149bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3150 if (N->getValueType(0).getVectorNumElements() != 4) 3151 return false; 3152 3153 // Expect 1, 1, 3, 3 3154 for (unsigned i = 0; i < 2; ++i) { 3155 int Elt = N->getMaskElt(i); 3156 if (Elt >= 0 && Elt != 1) 3157 return false; 3158 } 3159 3160 bool HasHi = false; 3161 for (unsigned i = 2; i < 4; ++i) { 3162 int Elt = N->getMaskElt(i); 3163 if (Elt >= 0 && Elt != 3) 3164 return false; 3165 if (Elt == 3) 3166 HasHi = true; 3167 } 3168 // Don't use movshdup if it can be done with a shufps. 3169 // FIXME: verify that matching u, u, 3, 3 is what we want. 3170 return HasHi; 3171} 3172 3173/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3174/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3175bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3176 if (N->getValueType(0).getVectorNumElements() != 4) 3177 return false; 3178 3179 // Expect 0, 0, 2, 2 3180 for (unsigned i = 0; i < 2; ++i) 3181 if (N->getMaskElt(i) > 0) 3182 return false; 3183 3184 bool HasHi = false; 3185 for (unsigned i = 2; i < 4; ++i) { 3186 int Elt = N->getMaskElt(i); 3187 if (Elt >= 0 && Elt != 2) 3188 return false; 3189 if (Elt == 2) 3190 HasHi = true; 3191 } 3192 // Don't use movsldup if it can be done with a shufps. 3193 return HasHi; 3194} 3195 3196/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3197/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3198bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3199 int e = N->getValueType(0).getVectorNumElements() / 2; 3200 3201 for (int i = 0; i < e; ++i) 3202 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3203 return false; 3204 for (int i = 0; i < e; ++i) 3205 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3206 return false; 3207 return true; 3208} 3209 3210/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3211/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3212unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3213 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3214 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3215 3216 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3217 unsigned Mask = 0; 3218 for (int i = 0; i < NumOperands; ++i) { 3219 int Val = SVOp->getMaskElt(NumOperands-i-1); 3220 if (Val < 0) Val = 0; 3221 if (Val >= NumOperands) Val -= NumOperands; 3222 Mask |= Val; 3223 if (i != NumOperands - 1) 3224 Mask <<= Shift; 3225 } 3226 return Mask; 3227} 3228 3229/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3230/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3231unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3232 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3233 unsigned Mask = 0; 3234 // 8 nodes, but we only care about the last 4. 3235 for (unsigned i = 7; i >= 4; --i) { 3236 int Val = SVOp->getMaskElt(i); 3237 if (Val >= 0) 3238 Mask |= (Val - 4); 3239 if (i != 4) 3240 Mask <<= 2; 3241 } 3242 return Mask; 3243} 3244 3245/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3246/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3247unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3248 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3249 unsigned Mask = 0; 3250 // 8 nodes, but we only care about the first 4. 3251 for (int i = 3; i >= 0; --i) { 3252 int Val = SVOp->getMaskElt(i); 3253 if (Val >= 0) 3254 Mask |= Val; 3255 if (i != 0) 3256 Mask <<= 2; 3257 } 3258 return Mask; 3259} 3260 3261/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3262/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3263unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3264 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3265 EVT VVT = N->getValueType(0); 3266 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3267 int Val = 0; 3268 3269 unsigned i, e; 3270 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3271 Val = SVOp->getMaskElt(i); 3272 if (Val >= 0) 3273 break; 3274 } 3275 return (Val - i) * EltSize; 3276} 3277 3278/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3279/// constant +0.0. 3280bool X86::isZeroNode(SDValue Elt) { 3281 return ((isa<ConstantSDNode>(Elt) && 3282 cast<ConstantSDNode>(Elt)->isNullValue()) || 3283 (isa<ConstantFPSDNode>(Elt) && 3284 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3285} 3286 3287/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3288/// their permute mask. 3289static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3290 SelectionDAG &DAG) { 3291 EVT VT = SVOp->getValueType(0); 3292 unsigned NumElems = VT.getVectorNumElements(); 3293 SmallVector<int, 8> MaskVec; 3294 3295 for (unsigned i = 0; i != NumElems; ++i) { 3296 int idx = SVOp->getMaskElt(i); 3297 if (idx < 0) 3298 MaskVec.push_back(idx); 3299 else if (idx < (int)NumElems) 3300 MaskVec.push_back(idx + NumElems); 3301 else 3302 MaskVec.push_back(idx - NumElems); 3303 } 3304 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3305 SVOp->getOperand(0), &MaskVec[0]); 3306} 3307 3308/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3309/// the two vector operands have swapped position. 
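/// For example, a v4i32 mask <4, 1, -1, 3> becomes <0, 5, -1, 7>; undef (-1)
/// entries are left untouched.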
3310static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3311 unsigned NumElems = VT.getVectorNumElements(); 3312 for (unsigned i = 0; i != NumElems; ++i) { 3313 int idx = Mask[i]; 3314 if (idx < 0) 3315 continue; 3316 else if (idx < (int)NumElems) 3317 Mask[i] = idx + NumElems; 3318 else 3319 Mask[i] = idx - NumElems; 3320 } 3321} 3322 3323/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3324/// match movhlps. The lower half elements should come from upper half of 3325/// V1 (and in order), and the upper half elements should come from the upper 3326/// half of V2 (and in order). 3327static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3328 if (Op->getValueType(0).getVectorNumElements() != 4) 3329 return false; 3330 for (unsigned i = 0, e = 2; i != e; ++i) 3331 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3332 return false; 3333 for (unsigned i = 2; i != 4; ++i) 3334 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3335 return false; 3336 return true; 3337} 3338 3339/// isScalarLoadToVector - Returns true if the node is a scalar load that 3340/// is promoted to a vector. It also returns the LoadSDNode by reference if 3341/// required. 3342static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3343 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3344 return false; 3345 N = N->getOperand(0).getNode(); 3346 if (!ISD::isNON_EXTLoad(N)) 3347 return false; 3348 if (LD) 3349 *LD = cast<LoadSDNode>(N); 3350 return true; 3351} 3352 3353/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3354/// match movlp{s|d}. The lower half elements should come from lower half of 3355/// V1 (and in order), and the upper half elements should come from the upper 3356/// half of V2 (and in order). And since V1 will become the source of the 3357/// MOVLP, it must be either a vector load or a scalar load to vector. 3358static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3359 ShuffleVectorSDNode *Op) { 3360 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3361 return false; 3362 // Is V2 is a vector load, don't do this transformation. We will try to use 3363 // load folding shufps op. 3364 if (ISD::isNON_EXTLoad(V2)) 3365 return false; 3366 3367 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3368 3369 if (NumElems != 2 && NumElems != 4) 3370 return false; 3371 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3372 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3373 return false; 3374 for (unsigned i = NumElems/2; i != NumElems; ++i) 3375 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3376 return false; 3377 return true; 3378} 3379 3380/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3381/// all the same. 3382static bool isSplatVector(SDNode *N) { 3383 if (N->getOpcode() != ISD::BUILD_VECTOR) 3384 return false; 3385 3386 SDValue SplatValue = N->getOperand(0); 3387 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3388 if (N->getOperand(i) != SplatValue) 3389 return false; 3390 return true; 3391} 3392 3393/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3394/// to an zero vector. 
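/// That is, every lane of the result is either undef or selects a known-zero
/// element: the corresponding source operand is undef, an all-zeros
/// build_vector, or a BUILD_VECTOR whose selected element satisfies
/// X86::isZeroNode.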
3395/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3396static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3397 SDValue V1 = N->getOperand(0); 3398 SDValue V2 = N->getOperand(1); 3399 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3400 for (unsigned i = 0; i != NumElems; ++i) { 3401 int Idx = N->getMaskElt(i); 3402 if (Idx >= (int)NumElems) { 3403 unsigned Opc = V2.getOpcode(); 3404 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3405 continue; 3406 if (Opc != ISD::BUILD_VECTOR || 3407 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3408 return false; 3409 } else if (Idx >= 0) { 3410 unsigned Opc = V1.getOpcode(); 3411 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3412 continue; 3413 if (Opc != ISD::BUILD_VECTOR || 3414 !X86::isZeroNode(V1.getOperand(Idx))) 3415 return false; 3416 } 3417 } 3418 return true; 3419} 3420 3421/// getZeroVector - Returns a vector of specified type with all zero elements. 3422/// 3423static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3424 DebugLoc dl) { 3425 assert(VT.isVector() && "Expected a vector type"); 3426 3427 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted 3428 // to their dest type. This ensures they get CSE'd. 3429 SDValue Vec; 3430 if (VT.getSizeInBits() == 64) { // MMX 3431 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3432 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3433 } else if (VT.getSizeInBits() == 128) { 3434 if (HasSSE2) { // SSE2 3435 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3436 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3437 } else { // SSE1 3438 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3439 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3440 } 3441 } else if (VT.getSizeInBits() == 256) { // AVX 3442 // 256-bit logic and arithmetic instructions in AVX are 3443 // all floating-point, no support for integer ops. Default 3444 // to emitting fp zeroed vectors then. 3445 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3446 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3447 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3448 } 3449 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3450} 3451 3452/// getOnesVector - Returns a vector of specified type with all bits set. 3453/// 3454static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3455 assert(VT.isVector() && "Expected a vector type"); 3456 3457 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3458 // type. This ensures they get CSE'd. 3459 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3460 SDValue Vec; 3461 if (VT.getSizeInBits() == 64) // MMX 3462 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3463 else // SSE 3464 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3465 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3466} 3467 3468 3469/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3470/// that point to V2 points to its first element. 
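/// For example, with a splatted V2 a v4i32 mask <0, 5, 1, 7> is rewritten to
/// <0, 4, 1, 4>, so every reference into V2 reads its (splatted) element 0.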
3471static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3472 EVT VT = SVOp->getValueType(0); 3473 unsigned NumElems = VT.getVectorNumElements(); 3474 3475 bool Changed = false; 3476 SmallVector<int, 8> MaskVec; 3477 SVOp->getMask(MaskVec); 3478 3479 for (unsigned i = 0; i != NumElems; ++i) { 3480 if (MaskVec[i] > (int)NumElems) { 3481 MaskVec[i] = NumElems; 3482 Changed = true; 3483 } 3484 } 3485 if (Changed) 3486 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3487 SVOp->getOperand(1), &MaskVec[0]); 3488 return SDValue(SVOp, 0); 3489} 3490 3491/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3492/// operation of specified width. 3493static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3494 SDValue V2) { 3495 unsigned NumElems = VT.getVectorNumElements(); 3496 SmallVector<int, 8> Mask; 3497 Mask.push_back(NumElems); 3498 for (unsigned i = 1; i != NumElems; ++i) 3499 Mask.push_back(i); 3500 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3501} 3502 3503/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3504static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3505 SDValue V2) { 3506 unsigned NumElems = VT.getVectorNumElements(); 3507 SmallVector<int, 8> Mask; 3508 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3509 Mask.push_back(i); 3510 Mask.push_back(i + NumElems); 3511 } 3512 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3513} 3514 3515/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3516static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3517 SDValue V2) { 3518 unsigned NumElems = VT.getVectorNumElements(); 3519 unsigned Half = NumElems/2; 3520 SmallVector<int, 8> Mask; 3521 for (unsigned i = 0; i != Half; ++i) { 3522 Mask.push_back(i + Half); 3523 Mask.push_back(i + NumElems + Half); 3524 } 3525 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3526} 3527 3528/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3529static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 3530 if (SV->getValueType(0).getVectorNumElements() <= 4) 3531 return SDValue(SV, 0); 3532 3533 EVT PVT = MVT::v4f32; 3534 EVT VT = SV->getValueType(0); 3535 DebugLoc dl = SV->getDebugLoc(); 3536 SDValue V1 = SV->getOperand(0); 3537 int NumElems = VT.getVectorNumElements(); 3538 int EltNo = SV->getSplatIndex(); 3539 3540 // unpack elements to the correct location 3541 while (NumElems > 4) { 3542 if (EltNo < NumElems/2) { 3543 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3544 } else { 3545 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3546 EltNo -= NumElems/2; 3547 } 3548 NumElems >>= 1; 3549 } 3550 3551 // Perform the splat. 3552 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3553 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3554 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3555 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3556} 3557 3558/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3559/// vector of zero or undef vector. This produces a shuffle where the low 3560/// element of V2 is swizzled into the zero/undef vector, landing at element 3561/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3562static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3563 bool isZero, bool HasSSE2, 3564 SelectionDAG &DAG) { 3565 EVT VT = V2.getValueType(); 3566 SDValue V1 = isZero 3567 ? 
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3568 unsigned NumElems = VT.getVectorNumElements(); 3569 SmallVector<int, 16> MaskVec; 3570 for (unsigned i = 0; i != NumElems; ++i) 3571 // If this is the insertion idx, put the low elt of V2 here. 3572 MaskVec.push_back(i == Idx ? NumElems : i); 3573 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3574} 3575 3576/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3577/// a shuffle that is zero. 3578static 3579unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3580 bool Low, SelectionDAG &DAG) { 3581 unsigned NumZeros = 0; 3582 for (int i = 0; i < NumElems; ++i) { 3583 unsigned Index = Low ? i : NumElems-i-1; 3584 int Idx = SVOp->getMaskElt(Index); 3585 if (Idx < 0) { 3586 ++NumZeros; 3587 continue; 3588 } 3589 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3590 if (Elt.getNode() && X86::isZeroNode(Elt)) 3591 ++NumZeros; 3592 else 3593 break; 3594 } 3595 return NumZeros; 3596} 3597 3598/// isVectorShift - Returns true if the shuffle can be implemented as a 3599/// logical left or right shift of a vector. 3600/// FIXME: split into pslldqi, psrldqi, palignr variants. 3601static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3602 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3603 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3604 3605 isLeft = true; 3606 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3607 if (!NumZeros) { 3608 isLeft = false; 3609 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3610 if (!NumZeros) 3611 return false; 3612 } 3613 bool SeenV1 = false; 3614 bool SeenV2 = false; 3615 for (unsigned i = NumZeros; i < NumElems; ++i) { 3616 unsigned Val = isLeft ? (i - NumZeros) : i; 3617 int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3618 if (Idx_ < 0) 3619 continue; 3620 unsigned Idx = (unsigned) Idx_; 3621 if (Idx < NumElems) 3622 SeenV1 = true; 3623 else { 3624 Idx -= NumElems; 3625 SeenV2 = true; 3626 } 3627 if (Idx != Val) 3628 return false; 3629 } 3630 if (SeenV1 && SeenV2) 3631 return false; 3632 3633 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3634 ShAmt = NumZeros; 3635 return true; 3636} 3637 3638 3639/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
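/// Adjacent byte pairs are zero-extended and merged into an i16 (even byte in
/// the low half, odd byte shifted into the high half), inserted into a v8i16
/// (normally selected as pinsrw), and the result is bitcast back to v16i8.
/// Inputs with more than 8 non-zero bytes are left to the generic lowering.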
3640/// 3641static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3642 unsigned NumNonZero, unsigned NumZero, 3643 SelectionDAG &DAG, 3644 const TargetLowering &TLI) { 3645 if (NumNonZero > 8) 3646 return SDValue(); 3647 3648 DebugLoc dl = Op.getDebugLoc(); 3649 SDValue V(0, 0); 3650 bool First = true; 3651 for (unsigned i = 0; i < 16; ++i) { 3652 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3653 if (ThisIsNonZero && First) { 3654 if (NumZero) 3655 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3656 else 3657 V = DAG.getUNDEF(MVT::v8i16); 3658 First = false; 3659 } 3660 3661 if ((i & 1) != 0) { 3662 SDValue ThisElt(0, 0), LastElt(0, 0); 3663 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3664 if (LastIsNonZero) { 3665 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3666 MVT::i16, Op.getOperand(i-1)); 3667 } 3668 if (ThisIsNonZero) { 3669 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3670 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3671 ThisElt, DAG.getConstant(8, MVT::i8)); 3672 if (LastIsNonZero) 3673 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3674 } else 3675 ThisElt = LastElt; 3676 3677 if (ThisElt.getNode()) 3678 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3679 DAG.getIntPtrConstant(i/2)); 3680 } 3681 } 3682 3683 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3684} 3685 3686/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3687/// 3688static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3689 unsigned NumNonZero, unsigned NumZero, 3690 SelectionDAG &DAG, 3691 const TargetLowering &TLI) { 3692 if (NumNonZero > 4) 3693 return SDValue(); 3694 3695 DebugLoc dl = Op.getDebugLoc(); 3696 SDValue V(0, 0); 3697 bool First = true; 3698 for (unsigned i = 0; i < 8; ++i) { 3699 bool isNonZero = (NonZeros & (1 << i)) != 0; 3700 if (isNonZero) { 3701 if (First) { 3702 if (NumZero) 3703 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3704 else 3705 V = DAG.getUNDEF(MVT::v8i16); 3706 First = false; 3707 } 3708 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3709 MVT::v8i16, V, Op.getOperand(i), 3710 DAG.getIntPtrConstant(i)); 3711 } 3712 } 3713 3714 return V; 3715} 3716 3717/// getVShift - Return a vector logical shift node. 3718/// 3719static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3720 unsigned NumBits, SelectionDAG &DAG, 3721 const TargetLowering &TLI, DebugLoc dl) { 3722 bool isMMX = VT.getSizeInBits() == 64; 3723 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3724 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3725 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3726 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3727 DAG.getNode(Opc, dl, ShVT, SrcOp, 3728 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3729} 3730 3731SDValue 3732X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3733 SelectionDAG &DAG) const { 3734 3735 // Check if the scalar load can be widened into a vector load. And if 3736 // the address is "base + cst" see if the cst can be "absorbed" into 3737 // the shuffle mask. 
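  // For example, a 4-byte scalar load from a stack slot at offset 8 can be
  // rewritten as a 16-byte vector load from the (re-aligned) start of the slot
  // followed by a <2, 2, 2, 2> splat shuffle of the loaded v4i32/v4f32 value.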
3738 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3739 SDValue Ptr = LD->getBasePtr(); 3740 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3741 return SDValue(); 3742 EVT PVT = LD->getValueType(0); 3743 if (PVT != MVT::i32 && PVT != MVT::f32) 3744 return SDValue(); 3745 3746 int FI = -1; 3747 int64_t Offset = 0; 3748 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3749 FI = FINode->getIndex(); 3750 Offset = 0; 3751 } else if (Ptr.getOpcode() == ISD::ADD && 3752 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3753 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3754 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3755 Offset = Ptr.getConstantOperandVal(1); 3756 Ptr = Ptr.getOperand(0); 3757 } else { 3758 return SDValue(); 3759 } 3760 3761 SDValue Chain = LD->getChain(); 3762 // Make sure the stack object alignment is at least 16. 3763 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3764 if (DAG.InferPtrAlignment(Ptr) < 16) { 3765 if (MFI->isFixedObjectIndex(FI)) { 3766 // Can't change the alignment. FIXME: It's possible to compute 3767 // the exact stack offset and reference FI + adjust offset instead. 3768 // If someone *really* cares about this. That's the way to implement it. 3769 return SDValue(); 3770 } else { 3771 MFI->setObjectAlignment(FI, 16); 3772 } 3773 } 3774 3775 // (Offset % 16) must be multiple of 4. Then address is then 3776 // Ptr + (Offset & ~15). 3777 if (Offset < 0) 3778 return SDValue(); 3779 if ((Offset % 16) & 3) 3780 return SDValue(); 3781 int64_t StartOffset = Offset & ~15; 3782 if (StartOffset) 3783 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 3784 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 3785 3786 int EltNo = (Offset - StartOffset) >> 2; 3787 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 3788 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; 3789 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, 3790 false, false, 0); 3791 // Canonicalize it to a v4i32 shuffle. 3792 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 3793 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3794 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 3795 DAG.getUNDEF(MVT::v4i32), &Mask[0])); 3796 } 3797 3798 return SDValue(); 3799} 3800 3801/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 3802/// vector of type 'VT', see if the elements can be replaced by a single large 3803/// load which has the same value as a build_vector whose operands are 'elts'. 3804/// 3805/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 3806/// 3807/// FIXME: we'd also like to handle the case where the last elements are zero 3808/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 3809/// There's even a handy isZeroNode for that purpose. 3810static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 3811 DebugLoc &dl, SelectionDAG &DAG) { 3812 EVT EltVT = VT.getVectorElementType(); 3813 unsigned NumElems = Elts.size(); 3814 3815 LoadSDNode *LDBase = NULL; 3816 unsigned LastLoadedElt = -1U; 3817 3818 // For each element in the initializer, see if we've found a load or an undef. 3819 // If we don't find an initial load element, or later load elements are 3820 // non-consecutive, bail out. 
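  // The first element must itself be a load; each later load must sit exactly
  // i * EltSize bytes past it (that is what the DAG.isConsecutiveLoad query
  // below checks), and undef elements are tolerated but do not extend the run.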
3821 for (unsigned i = 0; i < NumElems; ++i) { 3822 SDValue Elt = Elts[i]; 3823 3824 if (!Elt.getNode() || 3825 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 3826 return SDValue(); 3827 if (!LDBase) { 3828 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 3829 return SDValue(); 3830 LDBase = cast<LoadSDNode>(Elt.getNode()); 3831 LastLoadedElt = i; 3832 continue; 3833 } 3834 if (Elt.getOpcode() == ISD::UNDEF) 3835 continue; 3836 3837 LoadSDNode *LD = cast<LoadSDNode>(Elt); 3838 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 3839 return SDValue(); 3840 LastLoadedElt = i; 3841 } 3842 3843 // If we have found an entire vector of loads and undefs, then return a large 3844 // load of the entire vector width starting at the base pointer. If we found 3845 // consecutive loads for the low half, generate a vzext_load node. 3846 if (LastLoadedElt == NumElems - 1) { 3847 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 3848 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3849 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3850 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 3851 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3852 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3853 LDBase->isVolatile(), LDBase->isNonTemporal(), 3854 LDBase->getAlignment()); 3855 } else if (NumElems == 4 && LastLoadedElt == 1) { 3856 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 3857 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 3858 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 3859 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 3860 } 3861 return SDValue(); 3862} 3863 3864SDValue 3865X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 3866 DebugLoc dl = Op.getDebugLoc(); 3867 // All zero's are handled with pxor in SSE2 and above, xorps in SSE1 and 3868 // all one's are handled with pcmpeqd. In AVX, zero's are handled with 3869 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd 3870 // is present, so AllOnes is ignored. 3871 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 3872 (Op.getValueType().getSizeInBits() != 256 && 3873 ISD::isBuildVectorAllOnes(Op.getNode()))) { 3874 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3875 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3876 // eliminated on x86-32 hosts. 3877 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3878 return Op; 3879 3880 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3881 return getOnesVector(Op.getValueType(), DAG, dl); 3882 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3883 } 3884 3885 EVT VT = Op.getValueType(); 3886 EVT ExtVT = VT.getVectorElementType(); 3887 unsigned EVTBits = ExtVT.getSizeInBits(); 3888 3889 unsigned NumElems = Op.getNumOperands(); 3890 unsigned NumZero = 0; 3891 unsigned NumNonZero = 0; 3892 unsigned NonZeros = 0; 3893 bool IsAllConstants = true; 3894 SmallSet<SDValue, 8> Values; 3895 for (unsigned i = 0; i < NumElems; ++i) { 3896 SDValue Elt = Op.getOperand(i); 3897 if (Elt.getOpcode() == ISD::UNDEF) 3898 continue; 3899 Values.insert(Elt); 3900 if (Elt.getOpcode() != ISD::Constant && 3901 Elt.getOpcode() != ISD::ConstantFP) 3902 IsAllConstants = false; 3903 if (X86::isZeroNode(Elt)) 3904 NumZero++; 3905 else { 3906 NonZeros |= (1 << i); 3907 NumNonZero++; 3908 } 3909 } 3910 3911 if (NumNonZero == 0) { 3912 // All undef vector. 
Return an UNDEF. All zero vectors were handled above. 3913 return DAG.getUNDEF(VT); 3914 } 3915 3916 // Special case for single non-zero, non-undef, element. 3917 if (NumNonZero == 1) { 3918 unsigned Idx = CountTrailingZeros_32(NonZeros); 3919 SDValue Item = Op.getOperand(Idx); 3920 3921 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3922 // the value are obviously zero, truncate the value to i32 and do the 3923 // insertion that way. Only do this if the value is non-constant or if the 3924 // value is a constant being inserted into element 0. It is cheaper to do 3925 // a constant pool load than it is to do a movd + shuffle. 3926 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3927 (!IsAllConstants || Idx == 0)) { 3928 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3929 // Handle MMX and SSE both. 3930 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3931 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3932 3933 // Truncate the value (which may itself be a constant) to i32, and 3934 // convert it to a vector with movd (S2V+shuffle to zero extend). 3935 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3936 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3937 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3938 Subtarget->hasSSE2(), DAG); 3939 3940 // Now we have our 32-bit value zero extended in the low element of 3941 // a vector. If Idx != 0, swizzle it into place. 3942 if (Idx != 0) { 3943 SmallVector<int, 4> Mask; 3944 Mask.push_back(Idx); 3945 for (unsigned i = 1; i != VecElts; ++i) 3946 Mask.push_back(i); 3947 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3948 DAG.getUNDEF(Item.getValueType()), 3949 &Mask[0]); 3950 } 3951 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3952 } 3953 } 3954 3955 // If we have a constant or non-constant insertion into the low element of 3956 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3957 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3958 // depending on what the source datatype is. 3959 if (Idx == 0) { 3960 if (NumZero == 0) { 3961 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3962 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3963 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3964 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3965 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3966 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3967 DAG); 3968 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3969 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3970 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3971 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3972 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3973 Subtarget->hasSSE2(), DAG); 3974 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3975 } 3976 } 3977 3978 // Is it a vector logical left shift? 3979 if (NumElems == 2 && Idx == 1 && 3980 X86::isZeroNode(Op.getOperand(0)) && 3981 !X86::isZeroNode(Op.getOperand(1))) { 3982 unsigned NumBits = VT.getSizeInBits(); 3983 return getVShift(true, VT, 3984 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3985 VT, Op.getOperand(1)), 3986 NumBits/2, DAG, *this, dl); 3987 } 3988 3989 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
3990 return SDValue(); 3991 3992 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3993 // is a non-constant being inserted into an element other than the low one, 3994 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3995 // movd/movss) to move this into the low element, then shuffle it into 3996 // place. 3997 if (EVTBits == 32) { 3998 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3999 4000 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4001 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4002 Subtarget->hasSSE2(), DAG); 4003 SmallVector<int, 8> MaskVec; 4004 for (unsigned i = 0; i < NumElems; i++) 4005 MaskVec.push_back(i == Idx ? 0 : 1); 4006 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4007 } 4008 } 4009 4010 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4011 if (Values.size() == 1) { 4012 if (EVTBits == 32) { 4013 // Instead of a shuffle like this: 4014 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4015 // Check if it's possible to issue this instead. 4016 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4017 unsigned Idx = CountTrailingZeros_32(NonZeros); 4018 SDValue Item = Op.getOperand(Idx); 4019 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4020 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4021 } 4022 return SDValue(); 4023 } 4024 4025 // A vector full of immediates; various special cases are already 4026 // handled, so this is best done with a single constant-pool load. 4027 if (IsAllConstants) 4028 return SDValue(); 4029 4030 // Let legalizer expand 2-wide build_vectors. 4031 if (EVTBits == 64) { 4032 if (NumNonZero == 1) { 4033 // One half is zero or undef. 4034 unsigned Idx = CountTrailingZeros_32(NonZeros); 4035 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4036 Op.getOperand(Idx)); 4037 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4038 Subtarget->hasSSE2(), DAG); 4039 } 4040 return SDValue(); 4041 } 4042 4043 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4044 if (EVTBits == 8 && NumElems == 16) { 4045 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4046 *this); 4047 if (V.getNode()) return V; 4048 } 4049 4050 if (EVTBits == 16 && NumElems == 8) { 4051 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4052 *this); 4053 if (V.getNode()) return V; 4054 } 4055 4056 // If element VT is == 32 bits, turn it into a number of shuffles. 4057 SmallVector<SDValue, 8> V; 4058 V.resize(NumElems); 4059 if (NumElems == 4 && NumZero > 0) { 4060 for (unsigned i = 0; i < 4; ++i) { 4061 bool isZero = !(NonZeros & (1 << i)); 4062 if (isZero) 4063 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4064 else 4065 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4066 } 4067 4068 for (unsigned i = 0; i < 2; ++i) { 4069 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4070 default: break; 4071 case 0: 4072 V[i] = V[i*2]; // Must be a zero vector. 4073 break; 4074 case 1: 4075 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4076 break; 4077 case 2: 4078 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4079 break; 4080 case 3: 4081 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4082 break; 4083 } 4084 } 4085 4086 SmallVector<int, 8> MaskVec; 4087 bool Reverse = (NonZeros & 0x3) == 2; 4088 for (unsigned i = 0; i < 2; ++i) 4089 MaskVec.push_back(Reverse ? 
1-i : i); 4090 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4091 for (unsigned i = 0; i < 2; ++i) 4092 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4093 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4094 } 4095 4096 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4097 // Check for a build vector of consecutive loads. 4098 for (unsigned i = 0; i < NumElems; ++i) 4099 V[i] = Op.getOperand(i); 4100 4101 // Check for elements which are consecutive loads. 4102 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4103 if (LD.getNode()) 4104 return LD; 4105 4106 // For SSE 4.1, use inserts into undef. 4107 if (getSubtarget()->hasSSE41()) { 4108 V[0] = DAG.getUNDEF(VT); 4109 for (unsigned i = 0; i < NumElems; ++i) 4110 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4111 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 4112 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4113 return V[0]; 4114 } 4115 4116 // Otherwise, expand into a number of unpckl* 4117 // e.g. for v4f32 4118 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4119 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4120 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4121 for (unsigned i = 0; i < NumElems; ++i) 4122 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4123 NumElems >>= 1; 4124 while (NumElems != 0) { 4125 for (unsigned i = 0; i < NumElems; ++i) 4126 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 4127 NumElems >>= 1; 4128 } 4129 return V[0]; 4130 } 4131 return SDValue(); 4132} 4133 4134SDValue 4135X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4136 // We support concatenate two MMX registers and place them in a MMX 4137 // register. This is better than doing a stack convert. 4138 DebugLoc dl = Op.getDebugLoc(); 4139 EVT ResVT = Op.getValueType(); 4140 assert(Op.getNumOperands() == 2); 4141 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4142 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4143 int Mask[2]; 4144 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); 4145 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4146 InVec = Op.getOperand(1); 4147 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4148 unsigned NumElts = ResVT.getVectorNumElements(); 4149 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4150 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4151 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4152 } else { 4153 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 4154 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4155 Mask[0] = 0; Mask[1] = 2; 4156 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4157 } 4158 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4159} 4160 4161// v8i16 shuffles - Prefer shuffles in the following order: 4162// 1. [all] pshuflw, pshufhw, optional move 4163// 2. [ssse3] 1 x pshufb 4164// 3. [ssse3] 2 x pshufb + 1 x por 4165// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4166SDValue 4167X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4168 SelectionDAG &DAG) const { 4169 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4170 SDValue V1 = SVOp->getOperand(0); 4171 SDValue V2 = SVOp->getOperand(1); 4172 DebugLoc dl = SVOp->getDebugLoc(); 4173 SmallVector<int, 8> MaskVals; 4174 4175 // Determine if more than 1 of the words in each of the low and high quadwords 4176 // of the result come from the same quadword of one of the two inputs. 
Undef 4177 // mask values count as coming from any quadword, for better codegen. 4178 SmallVector<unsigned, 4> LoQuad(4); 4179 SmallVector<unsigned, 4> HiQuad(4); 4180 BitVector InputQuads(4); 4181 for (unsigned i = 0; i < 8; ++i) { 4182 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4183 int EltIdx = SVOp->getMaskElt(i); 4184 MaskVals.push_back(EltIdx); 4185 if (EltIdx < 0) { 4186 ++Quad[0]; 4187 ++Quad[1]; 4188 ++Quad[2]; 4189 ++Quad[3]; 4190 continue; 4191 } 4192 ++Quad[EltIdx / 4]; 4193 InputQuads.set(EltIdx / 4); 4194 } 4195 4196 int BestLoQuad = -1; 4197 unsigned MaxQuad = 1; 4198 for (unsigned i = 0; i < 4; ++i) { 4199 if (LoQuad[i] > MaxQuad) { 4200 BestLoQuad = i; 4201 MaxQuad = LoQuad[i]; 4202 } 4203 } 4204 4205 int BestHiQuad = -1; 4206 MaxQuad = 1; 4207 for (unsigned i = 0; i < 4; ++i) { 4208 if (HiQuad[i] > MaxQuad) { 4209 BestHiQuad = i; 4210 MaxQuad = HiQuad[i]; 4211 } 4212 } 4213 4214 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 4215 // of the two input vectors, shuffle them into one input vector so only a 4216 // single pshufb instruction is necessary. If There are more than 2 input 4217 // quads, disable the next transformation since it does not help SSSE3. 4218 bool V1Used = InputQuads[0] || InputQuads[1]; 4219 bool V2Used = InputQuads[2] || InputQuads[3]; 4220 if (Subtarget->hasSSSE3()) { 4221 if (InputQuads.count() == 2 && V1Used && V2Used) { 4222 BestLoQuad = InputQuads.find_first(); 4223 BestHiQuad = InputQuads.find_next(BestLoQuad); 4224 } 4225 if (InputQuads.count() > 2) { 4226 BestLoQuad = -1; 4227 BestHiQuad = -1; 4228 } 4229 } 4230 4231 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4232 // the shuffle mask. If a quad is scored as -1, that means that it contains 4233 // words from all 4 input quadwords. 4234 SDValue NewV; 4235 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4236 SmallVector<int, 8> MaskV; 4237 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4238 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4239 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4240 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4241 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4242 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE) 4243 NewV = LowerVECTOR_SHUFFLE(NewV, DAG); 4244 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4245 4246 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4247 // source words for the shuffle, to aid later transformations. 4248 bool AllWordsInNewV = true; 4249 bool InOrder[2] = { true, true }; 4250 for (unsigned i = 0; i != 8; ++i) { 4251 int idx = MaskVals[i]; 4252 if (idx != (int)i) 4253 InOrder[i/4] = false; 4254 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4255 continue; 4256 AllWordsInNewV = false; 4257 break; 4258 } 4259 4260 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4261 if (AllWordsInNewV) { 4262 for (int i = 0; i != 8; ++i) { 4263 int idx = MaskVals[i]; 4264 if (idx < 0) 4265 continue; 4266 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4267 if ((idx != i) && idx < 4) 4268 pshufhw = false; 4269 if ((idx != i) && idx > 3) 4270 pshuflw = false; 4271 } 4272 V1 = NewV; 4273 V2Used = false; 4274 BestLoQuad = 0; 4275 BestHiQuad = 1; 4276 } 4277 4278 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4279 // pshufhw, that's as cheap as it gets. Return the new shuffle. 
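    // For example, a one-input mask <0,1,2,3,7,6,5,4> keeps the low quadword
    // in place and only permutes the high one, so it is emitted here as a
    // single PSHUFHW (immediate 0x1B for this mask).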
4280 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4281 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 4282 unsigned TargetMask = 0; 4283 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4284 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4285 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 4286 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 4287 V1 = NewV.getOperand(0); 4288 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, V1, TargetMask, DAG); 4289 } 4290 } 4291 4292 // If we have SSSE3, and all words of the result are from 1 input vector, 4293 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4294 // is present, fall back to case 4. 4295 if (Subtarget->hasSSSE3()) { 4296 SmallVector<SDValue,16> pshufbMask; 4297 4298 // If we have elements from both input vectors, set the high bit of the 4299 // shuffle mask element to zero out elements that come from V2 in the V1 4300 // mask, and elements that come from V1 in the V2 mask, so that the two 4301 // results can be OR'd together. 4302 bool TwoInputs = V1Used && V2Used; 4303 for (unsigned i = 0; i != 8; ++i) { 4304 int EltIdx = MaskVals[i] * 2; 4305 if (TwoInputs && (EltIdx >= 16)) { 4306 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4307 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4308 continue; 4309 } 4310 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4311 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4312 } 4313 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4314 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4315 DAG.getNode(ISD::BUILD_VECTOR, dl, 4316 MVT::v16i8, &pshufbMask[0], 16)); 4317 if (!TwoInputs) 4318 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4319 4320 // Calculate the shuffle mask for the second input, shuffle it, and 4321 // OR it with the first shuffled input. 4322 pshufbMask.clear(); 4323 for (unsigned i = 0; i != 8; ++i) { 4324 int EltIdx = MaskVals[i] * 2; 4325 if (EltIdx < 16) { 4326 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4327 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4328 continue; 4329 } 4330 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4331 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4332 } 4333 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4334 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4335 DAG.getNode(ISD::BUILD_VECTOR, dl, 4336 MVT::v16i8, &pshufbMask[0], 16)); 4337 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4338 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4339 } 4340 4341 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4342 // and update MaskVals with new element order. 4343 BitVector InOrder(8); 4344 if (BestLoQuad >= 0) { 4345 SmallVector<int, 8> MaskV; 4346 for (int i = 0; i != 4; ++i) { 4347 int idx = MaskVals[i]; 4348 if (idx < 0) { 4349 MaskV.push_back(-1); 4350 InOrder.set(i); 4351 } else if ((idx / 4) == BestLoQuad) { 4352 MaskV.push_back(idx & 3); 4353 InOrder.set(i); 4354 } else { 4355 MaskV.push_back(-1); 4356 } 4357 } 4358 for (unsigned i = 4; i != 8; ++i) 4359 MaskV.push_back(i); 4360 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4361 &MaskV[0]); 4362 } 4363 4364 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4365 // and update MaskVals with the new element order. 
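  // Lanes whose word already comes from BestHiQuad (or is undef) are marked in
  // InOrder so the pextrw/pinsrw cleanup at the end can skip them; the other
  // lanes keep a -1 mask entry here and are fixed up individually below.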
4366 if (BestHiQuad >= 0) { 4367 SmallVector<int, 8> MaskV; 4368 for (unsigned i = 0; i != 4; ++i) 4369 MaskV.push_back(i); 4370 for (unsigned i = 4; i != 8; ++i) { 4371 int idx = MaskVals[i]; 4372 if (idx < 0) { 4373 MaskV.push_back(-1); 4374 InOrder.set(i); 4375 } else if ((idx / 4) == BestHiQuad) { 4376 MaskV.push_back((idx & 3) + 4); 4377 InOrder.set(i); 4378 } else { 4379 MaskV.push_back(-1); 4380 } 4381 } 4382 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4383 &MaskV[0]); 4384 } 4385 4386 // In case BestHi & BestLo were both -1, which means each quadword has a word 4387 // from each of the four input quadwords, calculate the InOrder bitvector now 4388 // before falling through to the insert/extract cleanup. 4389 if (BestLoQuad == -1 && BestHiQuad == -1) { 4390 NewV = V1; 4391 for (int i = 0; i != 8; ++i) 4392 if (MaskVals[i] < 0 || MaskVals[i] == i) 4393 InOrder.set(i); 4394 } 4395 4396 // The other elements are put in the right place using pextrw and pinsrw. 4397 for (unsigned i = 0; i != 8; ++i) { 4398 if (InOrder[i]) 4399 continue; 4400 int EltIdx = MaskVals[i]; 4401 if (EltIdx < 0) 4402 continue; 4403 SDValue ExtOp = (EltIdx < 8) 4404 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4405 DAG.getIntPtrConstant(EltIdx)) 4406 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4407 DAG.getIntPtrConstant(EltIdx - 8)); 4408 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4409 DAG.getIntPtrConstant(i)); 4410 } 4411 return NewV; 4412} 4413 4414// v16i8 shuffles - Prefer shuffles in the following order: 4415// 1. [ssse3] 1 x pshufb 4416// 2. [ssse3] 2 x pshufb + 1 x por 4417// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4418static 4419SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4420 SelectionDAG &DAG, 4421 const X86TargetLowering &TLI) { 4422 SDValue V1 = SVOp->getOperand(0); 4423 SDValue V2 = SVOp->getOperand(1); 4424 DebugLoc dl = SVOp->getDebugLoc(); 4425 SmallVector<int, 16> MaskVals; 4426 SVOp->getMask(MaskVals); 4427 4428 // If we have SSSE3, case 1 is generated when all result bytes come from 4429 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4430 // present, fall back to case 3. 4431 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 4432 bool V1Only = true; 4433 bool V2Only = true; 4434 for (unsigned i = 0; i < 16; ++i) { 4435 int EltIdx = MaskVals[i]; 4436 if (EltIdx < 0) 4437 continue; 4438 if (EltIdx < 16) 4439 V2Only = false; 4440 else 4441 V1Only = false; 4442 } 4443 4444 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4445 if (TLI.getSubtarget()->hasSSSE3()) { 4446 SmallVector<SDValue,16> pshufbMask; 4447 4448 // If all result elements are from one input vector, then only translate 4449 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4450 // 4451 // Otherwise, we have elements from both input vectors, and must zero out 4452 // elements that come from V2 in the first mask, and V1 in the second mask 4453 // so that we can OR them together. 4454 bool TwoInputs = !(V1Only || V2Only); 4455 for (unsigned i = 0; i != 16; ++i) { 4456 int EltIdx = MaskVals[i]; 4457 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4458 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4459 continue; 4460 } 4461 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4462 } 4463 // If all the elements are from V2, assign it to V1 and return after 4464 // building the first pshufb. 
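    // When only V2 feeds the result, the mask bytes above are still the raw
    // 16..31 values; PSHUFB only inspects bit 7 (zero the byte) and the low
    // four bits (source index) of each mask byte, so they select the right
    // bytes once V1 is replaced with V2 below.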
4465 if (V2Only) 4466 V1 = V2; 4467 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4468 DAG.getNode(ISD::BUILD_VECTOR, dl, 4469 MVT::v16i8, &pshufbMask[0], 16)); 4470 if (!TwoInputs) 4471 return V1; 4472 4473 // Calculate the shuffle mask for the second input, shuffle it, and 4474 // OR it with the first shuffled input. 4475 pshufbMask.clear(); 4476 for (unsigned i = 0; i != 16; ++i) { 4477 int EltIdx = MaskVals[i]; 4478 if (EltIdx < 16) { 4479 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4480 continue; 4481 } 4482 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4483 } 4484 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4485 DAG.getNode(ISD::BUILD_VECTOR, dl, 4486 MVT::v16i8, &pshufbMask[0], 16)); 4487 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4488 } 4489 4490 // No SSSE3 - Calculate in place words and then fix all out of place words 4491 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4492 // the 16 different words that comprise the two doublequadword input vectors. 4493 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4494 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4495 SDValue NewV = V2Only ? V2 : V1; 4496 for (int i = 0; i != 8; ++i) { 4497 int Elt0 = MaskVals[i*2]; 4498 int Elt1 = MaskVals[i*2+1]; 4499 4500 // This word of the result is all undef, skip it. 4501 if (Elt0 < 0 && Elt1 < 0) 4502 continue; 4503 4504 // This word of the result is already in the correct place, skip it. 4505 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4506 continue; 4507 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4508 continue; 4509 4510 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4511 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4512 SDValue InsElt; 4513 4514 // If Elt0 and Elt1 are defined, are consecutive, and can be load 4515 // using a single extract together, load it and store it. 4516 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4517 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4518 DAG.getIntPtrConstant(Elt1 / 2)); 4519 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4520 DAG.getIntPtrConstant(i)); 4521 continue; 4522 } 4523 4524 // If Elt1 is defined, extract it from the appropriate source. If the 4525 // source byte is not also odd, shift the extracted word left 8 bits 4526 // otherwise clear the bottom 8 bits if we need to do an or. 4527 if (Elt1 >= 0) { 4528 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4529 DAG.getIntPtrConstant(Elt1 / 2)); 4530 if ((Elt1 & 1) == 0) 4531 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4532 DAG.getConstant(8, TLI.getShiftAmountTy())); 4533 else if (Elt0 >= 0) 4534 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4535 DAG.getConstant(0xFF00, MVT::i16)); 4536 } 4537 // If Elt0 is defined, extract it from the appropriate source. If the 4538 // source byte is not also even, shift the extracted word right 8 bits. If 4539 // Elt1 was also defined, OR the extracted values together before 4540 // inserting them in the result. 4541 if (Elt0 >= 0) { 4542 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4543 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4544 if ((Elt0 & 1) != 0) 4545 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4546 DAG.getConstant(8, TLI.getShiftAmountTy())); 4547 else if (Elt1 >= 0) 4548 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4549 DAG.getConstant(0x00FF, MVT::i16)); 4550 InsElt = Elt1 >= 0 ? 
DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4551 : InsElt0; 4552 } 4553 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4554 DAG.getIntPtrConstant(i)); 4555 } 4556 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4557} 4558 4559/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4560/// ones, or rewriting v4i32 / v2i32 as 2 wide ones if possible. This can be 4561/// done when every pair / quad of shuffle mask elements point to elements in 4562/// the right sequence. e.g. 4563/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 4564static 4565SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4566 SelectionDAG &DAG, 4567 const TargetLowering &TLI, DebugLoc dl) { 4568 EVT VT = SVOp->getValueType(0); 4569 SDValue V1 = SVOp->getOperand(0); 4570 SDValue V2 = SVOp->getOperand(1); 4571 unsigned NumElems = VT.getVectorNumElements(); 4572 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 4573 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 4574 EVT NewVT = MaskVT; 4575 switch (VT.getSimpleVT().SimpleTy) { 4576 default: assert(false && "Unexpected!"); 4577 case MVT::v4f32: NewVT = MVT::v2f64; break; 4578 case MVT::v4i32: NewVT = MVT::v2i64; break; 4579 case MVT::v8i16: NewVT = MVT::v4i32; break; 4580 case MVT::v16i8: NewVT = MVT::v4i32; break; 4581 } 4582 4583 if (NewWidth == 2) { 4584 if (VT.isInteger()) 4585 NewVT = MVT::v2i64; 4586 else 4587 NewVT = MVT::v2f64; 4588 } 4589 int Scale = NumElems / NewWidth; 4590 SmallVector<int, 8> MaskVec; 4591 for (unsigned i = 0; i < NumElems; i += Scale) { 4592 int StartIdx = -1; 4593 for (int j = 0; j < Scale; ++j) { 4594 int EltIdx = SVOp->getMaskElt(i+j); 4595 if (EltIdx < 0) 4596 continue; 4597 if (StartIdx == -1) 4598 StartIdx = EltIdx - (EltIdx % Scale); 4599 if (EltIdx != StartIdx + j) 4600 return SDValue(); 4601 } 4602 if (StartIdx == -1) 4603 MaskVec.push_back(-1); 4604 else 4605 MaskVec.push_back(StartIdx / Scale); 4606 } 4607 4608 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4609 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4610 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4611} 4612 4613/// getVZextMovL - Return a zero-extending vector move low node. 4614/// 4615static SDValue getVZextMovL(EVT VT, EVT OpVT, 4616 SDValue SrcOp, SelectionDAG &DAG, 4617 const X86Subtarget *Subtarget, DebugLoc dl) { 4618 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4619 LoadSDNode *LD = NULL; 4620 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4621 LD = dyn_cast<LoadSDNode>(SrcOp); 4622 if (!LD) { 4623 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4624 // instead. 4625 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4626 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4627 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4628 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4629 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4630 // PR2108 4631 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4632 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4633 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4634 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4635 OpVT, 4636 SrcOp.getOperand(0) 4637 .getOperand(0)))); 4638 } 4639 } 4640 } 4641 4642 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4643 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4644 DAG.getNode(ISD::BIT_CONVERT, dl, 4645 OpVT, SrcOp))); 4646} 4647 4648/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4649/// shuffles. 
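/// E.g. the v4f32 mask <0,4,1,5>, which takes two elements from each input, is
/// built with two shuffles: the first gathers <0,1,4,5> into one register, and
/// the second, using that result as both operands, puts the lanes into the
/// requested order.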
4650static SDValue 4651LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4652 SDValue V1 = SVOp->getOperand(0); 4653 SDValue V2 = SVOp->getOperand(1); 4654 DebugLoc dl = SVOp->getDebugLoc(); 4655 EVT VT = SVOp->getValueType(0); 4656 4657 SmallVector<std::pair<int, int>, 8> Locs; 4658 Locs.resize(4); 4659 SmallVector<int, 8> Mask1(4U, -1); 4660 SmallVector<int, 8> PermMask; 4661 SVOp->getMask(PermMask); 4662 4663 unsigned NumHi = 0; 4664 unsigned NumLo = 0; 4665 for (unsigned i = 0; i != 4; ++i) { 4666 int Idx = PermMask[i]; 4667 if (Idx < 0) { 4668 Locs[i] = std::make_pair(-1, -1); 4669 } else { 4670 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4671 if (Idx < 4) { 4672 Locs[i] = std::make_pair(0, NumLo); 4673 Mask1[NumLo] = Idx; 4674 NumLo++; 4675 } else { 4676 Locs[i] = std::make_pair(1, NumHi); 4677 if (2+NumHi < 4) 4678 Mask1[2+NumHi] = Idx; 4679 NumHi++; 4680 } 4681 } 4682 } 4683 4684 if (NumLo <= 2 && NumHi <= 2) { 4685 // If no more than two elements come from either vector. This can be 4686 // implemented with two shuffles. First shuffle gather the elements. 4687 // The second shuffle, which takes the first shuffle as both of its 4688 // vector operands, put the elements into the right order. 4689 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4690 4691 SmallVector<int, 8> Mask2(4U, -1); 4692 4693 for (unsigned i = 0; i != 4; ++i) { 4694 if (Locs[i].first == -1) 4695 continue; 4696 else { 4697 unsigned Idx = (i < 2) ? 0 : 4; 4698 Idx += Locs[i].first * 2 + Locs[i].second; 4699 Mask2[i] = Idx; 4700 } 4701 } 4702 4703 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4704 } else if (NumLo == 3 || NumHi == 3) { 4705 // Otherwise, we must have three elements from one vector, call it X, and 4706 // one element from the other, call it Y. First, use a shufps to build an 4707 // intermediate vector with the one element from Y and the element from X 4708 // that will be in the same half in the final destination (the indexes don't 4709 // matter). Then, use a shufps to build the final vector, taking the half 4710 // containing the element from Y from the intermediate, and the other half 4711 // from X. 4712 if (NumHi == 3) { 4713 // Normalize it so the 3 elements come from V1. 4714 CommuteVectorShuffleMask(PermMask, VT); 4715 std::swap(V1, V2); 4716 } 4717 4718 // Find the element from V2. 4719 unsigned HiIndex; 4720 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4721 int Val = PermMask[HiIndex]; 4722 if (Val < 0) 4723 continue; 4724 if (Val >= 4) 4725 break; 4726 } 4727 4728 Mask1[0] = PermMask[HiIndex]; 4729 Mask1[1] = -1; 4730 Mask1[2] = PermMask[HiIndex^1]; 4731 Mask1[3] = -1; 4732 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4733 4734 if (HiIndex >= 2) { 4735 Mask1[0] = PermMask[0]; 4736 Mask1[1] = PermMask[1]; 4737 Mask1[2] = HiIndex & 1 ? 6 : 4; 4738 Mask1[3] = HiIndex & 1 ? 4 : 6; 4739 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4740 } else { 4741 Mask1[0] = HiIndex & 1 ? 2 : 0; 4742 Mask1[1] = HiIndex & 1 ? 0 : 2; 4743 Mask1[2] = PermMask[2]; 4744 Mask1[3] = PermMask[3]; 4745 if (Mask1[2] >= 0) 4746 Mask1[2] += 4; 4747 if (Mask1[3] >= 0) 4748 Mask1[3] += 4; 4749 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4750 } 4751 } 4752 4753 // Break it into (shuffle shuffle_hi, shuffle_lo). 
4754 Locs.clear(); 4755 SmallVector<int,8> LoMask(4U, -1); 4756 SmallVector<int,8> HiMask(4U, -1); 4757 4758 SmallVector<int,8> *MaskPtr = &LoMask; 4759 unsigned MaskIdx = 0; 4760 unsigned LoIdx = 0; 4761 unsigned HiIdx = 2; 4762 for (unsigned i = 0; i != 4; ++i) { 4763 if (i == 2) { 4764 MaskPtr = &HiMask; 4765 MaskIdx = 1; 4766 LoIdx = 0; 4767 HiIdx = 2; 4768 } 4769 int Idx = PermMask[i]; 4770 if (Idx < 0) { 4771 Locs[i] = std::make_pair(-1, -1); 4772 } else if (Idx < 4) { 4773 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4774 (*MaskPtr)[LoIdx] = Idx; 4775 LoIdx++; 4776 } else { 4777 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4778 (*MaskPtr)[HiIdx] = Idx; 4779 HiIdx++; 4780 } 4781 } 4782 4783 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4784 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4785 SmallVector<int, 8> MaskOps; 4786 for (unsigned i = 0; i != 4; ++i) { 4787 if (Locs[i].first == -1) { 4788 MaskOps.push_back(-1); 4789 } else { 4790 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4791 MaskOps.push_back(Idx); 4792 } 4793 } 4794 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4795} 4796 4797SDValue 4798X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 4799 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4800 SDValue V1 = Op.getOperand(0); 4801 SDValue V2 = Op.getOperand(1); 4802 EVT VT = Op.getValueType(); 4803 DebugLoc dl = Op.getDebugLoc(); 4804 unsigned NumElems = VT.getVectorNumElements(); 4805 bool isMMX = VT.getSizeInBits() == 64; 4806 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4807 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4808 bool V1IsSplat = false; 4809 bool V2IsSplat = false; 4810 4811 if (isZeroShuffle(SVOp)) 4812 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4813 4814 // Promote splats to v4f32. 4815 if (SVOp->isSplat()) { 4816 if (isMMX || NumElems < 4) 4817 return Op; 4818 return PromoteSplat(SVOp, DAG); 4819 } 4820 4821 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4822 // do it! 4823 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4824 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4825 if (NewOp.getNode()) 4826 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4827 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4828 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4829 // FIXME: Figure out a cleaner way to do this. 4830 // Try to make use of movq to zero out the top part. 4831 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4832 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4833 if (NewOp.getNode()) { 4834 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4835 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4836 DAG, Subtarget, dl); 4837 } 4838 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4839 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4840 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4841 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4842 DAG, Subtarget, dl); 4843 } 4844 } 4845 4846 if (X86::isPSHUFDMask(SVOp)) 4847 return Op; 4848 4849 // Check if this can be converted into a logical shift. 
4850 bool isLeft = false; 4851 unsigned ShAmt = 0; 4852 SDValue ShVal; 4853 bool isShift = getSubtarget()->hasSSE2() && 4854 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4855 if (isShift && ShVal.hasOneUse()) { 4856 // If the shifted value has multiple uses, it may be cheaper to use 4857 // v_set0 + movlhps or movhlps, etc. 4858 EVT EltVT = VT.getVectorElementType(); 4859 ShAmt *= EltVT.getSizeInBits(); 4860 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4861 } 4862 4863 if (X86::isMOVLMask(SVOp)) { 4864 if (V1IsUndef) 4865 return V2; 4866 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4867 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4868 if (!isMMX) 4869 return Op; 4870 } 4871 4872 // FIXME: fold these into legal mask. 4873 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4874 X86::isMOVSLDUPMask(SVOp) || 4875 X86::isMOVHLPSMask(SVOp) || 4876 X86::isMOVLHPSMask(SVOp) || 4877 X86::isMOVLPMask(SVOp))) 4878 return Op; 4879 4880 if (ShouldXformToMOVHLPS(SVOp) || 4881 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4882 return CommuteVectorShuffle(SVOp, DAG); 4883 4884 if (isShift) { 4885 // No better options. Use a vshl / vsrl. 4886 EVT EltVT = VT.getVectorElementType(); 4887 ShAmt *= EltVT.getSizeInBits(); 4888 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4889 } 4890 4891 bool Commuted = false; 4892 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4893 // 1,1,1,1 -> v8i16 though. 4894 V1IsSplat = isSplatVector(V1.getNode()); 4895 V2IsSplat = isSplatVector(V2.getNode()); 4896 4897 // Canonicalize the splat or undef, if present, to be on the RHS. 4898 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4899 Op = CommuteVectorShuffle(SVOp, DAG); 4900 SVOp = cast<ShuffleVectorSDNode>(Op); 4901 V1 = SVOp->getOperand(0); 4902 V2 = SVOp->getOperand(1); 4903 std::swap(V1IsSplat, V2IsSplat); 4904 std::swap(V1IsUndef, V2IsUndef); 4905 Commuted = true; 4906 } 4907 4908 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4909 // Shuffling low element of v1 into undef, just return v1. 4910 if (V2IsUndef) 4911 return V1; 4912 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4913 // the instruction selector will not match, so get a canonical MOVL with 4914 // swapped operands to undo the commute. 4915 return getMOVL(DAG, dl, VT, V2, V1); 4916 } 4917 4918 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4919 X86::isUNPCKH_v_undef_Mask(SVOp) || 4920 X86::isUNPCKLMask(SVOp) || 4921 X86::isUNPCKHMask(SVOp)) 4922 return Op; 4923 4924 if (V2IsSplat) { 4925 // Normalize mask so all entries that point to V2 points to its first 4926 // element then try to match unpck{h|l} again. If match, return a 4927 // new vector_shuffle with the corrected mask. 4928 SDValue NewMask = NormalizeMask(SVOp, DAG); 4929 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4930 if (NSVOp != SVOp) { 4931 if (X86::isUNPCKLMask(NSVOp, true)) { 4932 return NewMask; 4933 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4934 return NewMask; 4935 } 4936 } 4937 } 4938 4939 if (Commuted) { 4940 // Commute is back and try unpck* again. 4941 // FIXME: this seems wrong. 4942 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4943 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4944 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4945 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4946 X86::isUNPCKLMask(NewSVOp) || 4947 X86::isUNPCKHMask(NewSVOp)) 4948 return NewOp; 4949 } 4950 4951 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 
4952 4953 // Normalize the node to match x86 shuffle ops if needed 4954 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4955 return CommuteVectorShuffle(SVOp, DAG); 4956 4957 // Check for legal shuffle and return? 4958 SmallVector<int, 16> PermMask; 4959 SVOp->getMask(PermMask); 4960 if (isShuffleMaskLegal(PermMask, VT)) 4961 return Op; 4962 4963 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 4964 if (VT == MVT::v8i16) { 4965 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 4966 if (NewOp.getNode()) 4967 return NewOp; 4968 } 4969 4970 if (VT == MVT::v16i8) { 4971 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4972 if (NewOp.getNode()) 4973 return NewOp; 4974 } 4975 4976 // Handle all 4 wide cases with a number of shuffles except for MMX. 4977 if (NumElems == 4 && !isMMX) 4978 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4979 4980 return SDValue(); 4981} 4982 4983SDValue 4984X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4985 SelectionDAG &DAG) const { 4986 EVT VT = Op.getValueType(); 4987 DebugLoc dl = Op.getDebugLoc(); 4988 if (VT.getSizeInBits() == 8) { 4989 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4990 Op.getOperand(0), Op.getOperand(1)); 4991 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4992 DAG.getValueType(VT)); 4993 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4994 } else if (VT.getSizeInBits() == 16) { 4995 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4996 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4997 if (Idx == 0) 4998 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4999 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5000 DAG.getNode(ISD::BIT_CONVERT, dl, 5001 MVT::v4i32, 5002 Op.getOperand(0)), 5003 Op.getOperand(1))); 5004 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 5005 Op.getOperand(0), Op.getOperand(1)); 5006 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5007 DAG.getValueType(VT)); 5008 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5009 } else if (VT == MVT::f32) { 5010 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 5011 // the result back to FR32 register. It's only worth matching if the 5012 // result has a single use which is a store or a bitcast to i32. And in 5013 // the case of a store, it's not worth it if the index is a constant 0, 5014 // because a MOVSSmr can be used instead, which is smaller and faster. 5015 if (!Op.hasOneUse()) 5016 return SDValue(); 5017 SDNode *User = *Op.getNode()->use_begin(); 5018 if ((User->getOpcode() != ISD::STORE || 5019 (isa<ConstantSDNode>(Op.getOperand(1)) && 5020 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5021 (User->getOpcode() != ISD::BIT_CONVERT || 5022 User->getValueType(0) != MVT::i32)) 5023 return SDValue(); 5024 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5025 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 5026 Op.getOperand(0)), 5027 Op.getOperand(1)); 5028 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 5029 } else if (VT == MVT::i32) { 5030 // ExtractPS works with constant index. 
5031 if (isa<ConstantSDNode>(Op.getOperand(1))) 5032 return Op; 5033 } 5034 return SDValue(); 5035} 5036 5037 5038SDValue 5039X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5040 SelectionDAG &DAG) const { 5041 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5042 return SDValue(); 5043 5044 if (Subtarget->hasSSE41()) { 5045 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 5046 if (Res.getNode()) 5047 return Res; 5048 } 5049 5050 EVT VT = Op.getValueType(); 5051 DebugLoc dl = Op.getDebugLoc(); 5052 // TODO: handle v16i8. 5053 if (VT.getSizeInBits() == 16) { 5054 SDValue Vec = Op.getOperand(0); 5055 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5056 if (Idx == 0) 5057 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5058 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5059 DAG.getNode(ISD::BIT_CONVERT, dl, 5060 MVT::v4i32, Vec), 5061 Op.getOperand(1))); 5062 // Transform it so it match pextrw which produces a 32-bit result. 5063 EVT EltVT = MVT::i32; 5064 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 5065 Op.getOperand(0), Op.getOperand(1)); 5066 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 5067 DAG.getValueType(VT)); 5068 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5069 } else if (VT.getSizeInBits() == 32) { 5070 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5071 if (Idx == 0) 5072 return Op; 5073 5074 // SHUFPS the element to the lowest double word, then movss. 5075 int Mask[4] = { Idx, -1, -1, -1 }; 5076 EVT VVT = Op.getOperand(0).getValueType(); 5077 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5078 DAG.getUNDEF(VVT), Mask); 5079 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5080 DAG.getIntPtrConstant(0)); 5081 } else if (VT.getSizeInBits() == 64) { 5082 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 5083 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 5084 // to match extract_elt for f64. 5085 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5086 if (Idx == 0) 5087 return Op; 5088 5089 // UNPCKHPD the element to the lowest double word, then movsd. 5090 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 5091 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 5092 int Mask[2] = { 1, -1 }; 5093 EVT VVT = Op.getOperand(0).getValueType(); 5094 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5095 DAG.getUNDEF(VVT), Mask); 5096 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5097 DAG.getIntPtrConstant(0)); 5098 } 5099 5100 return SDValue(); 5101} 5102 5103SDValue 5104X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 5105 SelectionDAG &DAG) const { 5106 EVT VT = Op.getValueType(); 5107 EVT EltVT = VT.getVectorElementType(); 5108 DebugLoc dl = Op.getDebugLoc(); 5109 5110 SDValue N0 = Op.getOperand(0); 5111 SDValue N1 = Op.getOperand(1); 5112 SDValue N2 = Op.getOperand(2); 5113 5114 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 5115 isa<ConstantSDNode>(N2)) { 5116 unsigned Opc; 5117 if (VT == MVT::v8i16) 5118 Opc = X86ISD::PINSRW; 5119 else if (VT == MVT::v4i16) 5120 Opc = X86ISD::MMX_PINSRW; 5121 else if (VT == MVT::v16i8) 5122 Opc = X86ISD::PINSRB; 5123 else 5124 Opc = X86ISD::PINSRB; 5125 5126 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 5127 // argument. 
5128 if (N1.getValueType() != MVT::i32) 5129 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5130 if (N2.getValueType() != MVT::i32) 5131 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5132 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 5133 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 5134 // Bits [7:6] of the constant are the source select. This will always be 5135 // zero here. The DAG Combiner may combine an extract_elt index into these 5136 // bits. For example (insert (extract, 3), 2) could be matched by putting 5137 // the '3' into bits [7:6] of X86ISD::INSERTPS. 5138 // Bits [5:4] of the constant are the destination select. This is the 5139 // value of the incoming immediate. 5140 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 5141 // combine either bitwise AND or insert of float 0.0 to set these bits. 5142 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 5143 // Create this as a scalar to vector.. 5144 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 5145 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 5146 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 5147 // PINSR* works with constant index. 5148 return Op; 5149 } 5150 return SDValue(); 5151} 5152 5153SDValue 5154X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 5155 EVT VT = Op.getValueType(); 5156 EVT EltVT = VT.getVectorElementType(); 5157 5158 if (Subtarget->hasSSE41()) 5159 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 5160 5161 if (EltVT == MVT::i8) 5162 return SDValue(); 5163 5164 DebugLoc dl = Op.getDebugLoc(); 5165 SDValue N0 = Op.getOperand(0); 5166 SDValue N1 = Op.getOperand(1); 5167 SDValue N2 = Op.getOperand(2); 5168 5169 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5170 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 5171 // as its second argument. 5172 if (N1.getValueType() != MVT::i32) 5173 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5174 if (N2.getValueType() != MVT::i32) 5175 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5176 return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW, 5177 dl, VT, N0, N1, N2); 5178 } 5179 return SDValue(); 5180} 5181 5182SDValue 5183X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5184 DebugLoc dl = Op.getDebugLoc(); 5185 5186 if (Op.getValueType() == MVT::v1i64 && 5187 Op.getOperand(0).getValueType() == MVT::i64) 5188 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5189 5190 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5191 EVT VT = MVT::v2i32; 5192 switch (Op.getValueType().getSimpleVT().SimpleTy) { 5193 default: break; 5194 case MVT::v16i8: 5195 case MVT::v8i16: 5196 VT = MVT::v4i32; 5197 break; 5198 } 5199 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 5200 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 5201} 5202 5203// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5204// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 5205// one of the above mentioned nodes. It has to be wrapped because otherwise 5206// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 5207// be used to form addressing mode. These wrapped nodes will be selected 5208// into MOV32ri. 
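// E.g. a constant-pool reference in 32-bit non-PIC code becomes roughly
//   (X86ISD::Wrapper (TargetConstantPool cp#N)),
// while GOT-style PIC wraps a @GOTOFF reference and adds the PIC base:
//   (add (X86ISD::GlobalBaseReg),
//        (X86ISD::Wrapper (TargetConstantPool cp#N [MO_GOTOFF]))).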
5209SDValue 5210X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5211 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5212 5213 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5214 // global base reg. 5215 unsigned char OpFlag = 0; 5216 unsigned WrapperKind = X86ISD::Wrapper; 5217 CodeModel::Model M = getTargetMachine().getCodeModel(); 5218 5219 if (Subtarget->isPICStyleRIPRel() && 5220 (M == CodeModel::Small || M == CodeModel::Kernel)) 5221 WrapperKind = X86ISD::WrapperRIP; 5222 else if (Subtarget->isPICStyleGOT()) 5223 OpFlag = X86II::MO_GOTOFF; 5224 else if (Subtarget->isPICStyleStubPIC()) 5225 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5226 5227 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5228 CP->getAlignment(), 5229 CP->getOffset(), OpFlag); 5230 DebugLoc DL = CP->getDebugLoc(); 5231 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5232 // With PIC, the address is actually $g + Offset. 5233 if (OpFlag) { 5234 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5235 DAG.getNode(X86ISD::GlobalBaseReg, 5236 DebugLoc(), getPointerTy()), 5237 Result); 5238 } 5239 5240 return Result; 5241} 5242 5243SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5244 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5245 5246 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5247 // global base reg. 5248 unsigned char OpFlag = 0; 5249 unsigned WrapperKind = X86ISD::Wrapper; 5250 CodeModel::Model M = getTargetMachine().getCodeModel(); 5251 5252 if (Subtarget->isPICStyleRIPRel() && 5253 (M == CodeModel::Small || M == CodeModel::Kernel)) 5254 WrapperKind = X86ISD::WrapperRIP; 5255 else if (Subtarget->isPICStyleGOT()) 5256 OpFlag = X86II::MO_GOTOFF; 5257 else if (Subtarget->isPICStyleStubPIC()) 5258 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5259 5260 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5261 OpFlag); 5262 DebugLoc DL = JT->getDebugLoc(); 5263 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5264 5265 // With PIC, the address is actually $g + Offset. 5266 if (OpFlag) { 5267 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5268 DAG.getNode(X86ISD::GlobalBaseReg, 5269 DebugLoc(), getPointerTy()), 5270 Result); 5271 } 5272 5273 return Result; 5274} 5275 5276SDValue 5277X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5278 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5279 5280 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5281 // global base reg. 5282 unsigned char OpFlag = 0; 5283 unsigned WrapperKind = X86ISD::Wrapper; 5284 CodeModel::Model M = getTargetMachine().getCodeModel(); 5285 5286 if (Subtarget->isPICStyleRIPRel() && 5287 (M == CodeModel::Small || M == CodeModel::Kernel)) 5288 WrapperKind = X86ISD::WrapperRIP; 5289 else if (Subtarget->isPICStyleGOT()) 5290 OpFlag = X86II::MO_GOTOFF; 5291 else if (Subtarget->isPICStyleStubPIC()) 5292 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5293 5294 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5295 5296 DebugLoc DL = Op.getDebugLoc(); 5297 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5298 5299 5300 // With PIC, the address is actually $g + Offset. 
5301 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5302 !Subtarget->is64Bit()) { 5303 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5304 DAG.getNode(X86ISD::GlobalBaseReg, 5305 DebugLoc(), getPointerTy()), 5306 Result); 5307 } 5308 5309 return Result; 5310} 5311 5312SDValue 5313X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5314 // Create the TargetBlockAddressAddress node. 5315 unsigned char OpFlags = 5316 Subtarget->ClassifyBlockAddressReference(); 5317 CodeModel::Model M = getTargetMachine().getCodeModel(); 5318 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5319 DebugLoc dl = Op.getDebugLoc(); 5320 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5321 /*isTarget=*/true, OpFlags); 5322 5323 if (Subtarget->isPICStyleRIPRel() && 5324 (M == CodeModel::Small || M == CodeModel::Kernel)) 5325 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5326 else 5327 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5328 5329 // With PIC, the address is actually $g + Offset. 5330 if (isGlobalRelativeToPICBase(OpFlags)) { 5331 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5332 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5333 Result); 5334 } 5335 5336 return Result; 5337} 5338 5339SDValue 5340X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5341 int64_t Offset, 5342 SelectionDAG &DAG) const { 5343 // Create the TargetGlobalAddress node, folding in the constant 5344 // offset if it is legal. 5345 unsigned char OpFlags = 5346 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5347 CodeModel::Model M = getTargetMachine().getCodeModel(); 5348 SDValue Result; 5349 if (OpFlags == X86II::MO_NO_FLAG && 5350 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5351 // A direct static reference to a global. 5352 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 5353 Offset = 0; 5354 } else { 5355 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 5356 } 5357 5358 if (Subtarget->isPICStyleRIPRel() && 5359 (M == CodeModel::Small || M == CodeModel::Kernel)) 5360 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5361 else 5362 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5363 5364 // With PIC, the address is actually $g + Offset. 5365 if (isGlobalRelativeToPICBase(OpFlags)) { 5366 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5367 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5368 Result); 5369 } 5370 5371 // For globals that require a load from a stub to get the address, emit the 5372 // load. 5373 if (isGlobalStubReference(OpFlags)) 5374 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5375 PseudoSourceValue::getGOT(), 0, false, false, 0); 5376 5377 // If there was a non-zero offset that we didn't fold, create an explicit 5378 // addition for it. 
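  // E.g. for a reference to "gv + 8" where gv itself must be loaded from a GOT
  // or stub slot, the wrapped address is loaded first and the +8 is applied
  // with this separate ADD, since the offset cannot be folded into the stub
  // load.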
5379 if (Offset != 0) 5380 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 5381 DAG.getConstant(Offset, getPointerTy())); 5382 5383 return Result; 5384} 5385 5386SDValue 5387X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 5388 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5389 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 5390 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 5391} 5392 5393static SDValue 5394GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 5395 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 5396 unsigned char OperandFlags) { 5397 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5398 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5399 DebugLoc dl = GA->getDebugLoc(); 5400 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5401 GA->getValueType(0), 5402 GA->getOffset(), 5403 OperandFlags); 5404 if (InFlag) { 5405 SDValue Ops[] = { Chain, TGA, *InFlag }; 5406 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 5407 } else { 5408 SDValue Ops[] = { Chain, TGA }; 5409 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 5410 } 5411 5412 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 5413 MFI->setAdjustsStack(true); 5414 5415 SDValue Flag = Chain.getValue(1); 5416 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 5417} 5418 5419// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 5420static SDValue 5421LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5422 const EVT PtrVT) { 5423 SDValue InFlag; 5424 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 5425 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 5426 DAG.getNode(X86ISD::GlobalBaseReg, 5427 DebugLoc(), PtrVT), InFlag); 5428 InFlag = Chain.getValue(1); 5429 5430 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 5431} 5432 5433// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 5434static SDValue 5435LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5436 const EVT PtrVT) { 5437 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 5438 X86::RAX, X86II::MO_TLSGD); 5439} 5440 5441// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 5442// "local exec" model. 5443static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5444 const EVT PtrVT, TLSModel::Model model, 5445 bool is64Bit) { 5446 DebugLoc dl = GA->getDebugLoc(); 5447 // Get the Thread Pointer 5448 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 5449 DebugLoc(), PtrVT, 5450 DAG.getRegister(is64Bit? X86::FS : X86::GS, 5451 MVT::i32)); 5452 5453 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 5454 NULL, 0, false, false, 0); 5455 5456 unsigned char OperandFlags = 0; 5457 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 5458 // initialexec. 5459 unsigned WrapperKind = X86ISD::Wrapper; 5460 if (model == TLSModel::LocalExec) { 5461 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 5462 } else if (is64Bit) { 5463 assert(model == TLSModel::InitialExec); 5464 OperandFlags = X86II::MO_GOTTPOFF; 5465 WrapperKind = X86ISD::WrapperRIP; 5466 } else { 5467 assert(model == TLSModel::InitialExec); 5468 OperandFlags = X86II::MO_INDNTPOFF; 5469 } 5470 5471 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 5472 // exec) 5473 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5474 GA->getValueType(0), 5475 GA->getOffset(), OperandFlags); 5476 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 5477 5478 if (model == TLSModel::InitialExec) 5479 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 5480 PseudoSourceValue::getGOT(), 0, false, false, 0); 5481 5482 // The address of the thread local variable is the add of the thread 5483 // pointer with the offset of the variable. 5484 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 5485} 5486 5487SDValue 5488X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 5489 5490 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5491 const GlobalValue *GV = GA->getGlobal(); 5492 5493 if (Subtarget->isTargetELF()) { 5494 // TODO: implement the "local dynamic" model 5495 // TODO: implement the "initial exec"model for pic executables 5496 5497 // If GV is an alias then use the aliasee for determining 5498 // thread-localness. 5499 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5500 GV = GA->resolveAliasedGlobal(false); 5501 5502 TLSModel::Model model 5503 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 5504 5505 switch (model) { 5506 case TLSModel::GeneralDynamic: 5507 case TLSModel::LocalDynamic: // not implemented 5508 if (Subtarget->is64Bit()) 5509 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5510 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5511 5512 case TLSModel::InitialExec: 5513 case TLSModel::LocalExec: 5514 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5515 Subtarget->is64Bit()); 5516 } 5517 } else if (Subtarget->isTargetDarwin()) { 5518 // Darwin only has one model of TLS. Lower to that. 5519 unsigned char OpFlag = 0; 5520 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 5521 X86ISD::WrapperRIP : X86ISD::Wrapper; 5522 5523 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5524 // global base reg. 5525 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 5526 !Subtarget->is64Bit(); 5527 if (PIC32) 5528 OpFlag = X86II::MO_TLVP_PIC_BASE; 5529 else 5530 OpFlag = X86II::MO_TLVP; 5531 DebugLoc DL = Op.getDebugLoc(); 5532 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 5533 getPointerTy(), 5534 GA->getOffset(), OpFlag); 5535 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5536 5537 // With PIC32, the address is actually $g + Offset. 5538 if (PIC32) 5539 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5540 DAG.getNode(X86ISD::GlobalBaseReg, 5541 DebugLoc(), getPointerTy()), 5542 Offset); 5543 5544 // Lowering the machine isd will make sure everything is in the right 5545 // location. 5546 SDValue Args[] = { Offset }; 5547 SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); 5548 5549 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
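  // (In effect, the wrapped TLVP address above points at the variable's TLV
  // descriptor; TLSCALL calls through the descriptor's thunk pointer, and the
  // variable's address comes back in the normal return register, which is read
  // out below.)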
5550 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5551 MFI->setAdjustsStack(true); 5552 5553 // And our return value (tls address) is in the standard call return value 5554 // location. 5555 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 5556 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 5557 } 5558 5559 assert(false && 5560 "TLS not implemented for this target."); 5561 5562 llvm_unreachable("Unreachable"); 5563 return SDValue(); 5564} 5565 5566 5567/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5568/// take a 2 x i32 value to shift plus a shift amount. 5569SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 5570 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5571 EVT VT = Op.getValueType(); 5572 unsigned VTBits = VT.getSizeInBits(); 5573 DebugLoc dl = Op.getDebugLoc(); 5574 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5575 SDValue ShOpLo = Op.getOperand(0); 5576 SDValue ShOpHi = Op.getOperand(1); 5577 SDValue ShAmt = Op.getOperand(2); 5578 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5579 DAG.getConstant(VTBits - 1, MVT::i8)) 5580 : DAG.getConstant(0, VT); 5581 5582 SDValue Tmp2, Tmp3; 5583 if (Op.getOpcode() == ISD::SHL_PARTS) { 5584 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 5585 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5586 } else { 5587 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 5588 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 5589 } 5590 5591 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 5592 DAG.getConstant(VTBits, MVT::i8)); 5593 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 5594 AndNode, DAG.getConstant(0, MVT::i8)); 5595 5596 SDValue Hi, Lo; 5597 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5598 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 5599 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 5600 5601 if (Op.getOpcode() == ISD::SHL_PARTS) { 5602 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5603 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5604 } else { 5605 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5606 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5607 } 5608 5609 SDValue Ops[2] = { Lo, Hi }; 5610 return DAG.getMergeValues(Ops, 2, dl); 5611} 5612 5613SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 5614 SelectionDAG &DAG) const { 5615 EVT SrcVT = Op.getOperand(0).getValueType(); 5616 5617 if (SrcVT.isVector()) { 5618 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 5619 return Op; 5620 } 5621 return SDValue(); 5622 } 5623 5624 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 5625 "Unknown SINT_TO_FP to lower!"); 5626 5627 // These are really Legal; return the operand so the caller accepts it as 5628 // Legal. 
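  // (With SSE, i32 sources map directly onto CVTSI2SS/CVTSI2SD, and on x86-64
  // the 64-bit register forms of those instructions cover i64 as well, so no
  // expansion is needed for these cases.)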
5629 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 5630 return Op; 5631 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 5632 Subtarget->is64Bit()) { 5633 return Op; 5634 } 5635 5636 DebugLoc dl = Op.getDebugLoc(); 5637 unsigned Size = SrcVT.getSizeInBits()/8; 5638 MachineFunction &MF = DAG.getMachineFunction(); 5639 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 5640 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5641 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5642 StackSlot, 5643 PseudoSourceValue::getFixedStack(SSFI), 0, 5644 false, false, 0); 5645 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 5646} 5647 5648SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 5649 SDValue StackSlot, 5650 SelectionDAG &DAG) const { 5651 // Build the FILD 5652 DebugLoc dl = Op.getDebugLoc(); 5653 SDVTList Tys; 5654 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 5655 if (useSSE) 5656 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 5657 else 5658 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 5659 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 5660 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5661 Tys, Ops, array_lengthof(Ops)); 5662 5663 if (useSSE) { 5664 Chain = Result.getValue(1); 5665 SDValue InFlag = Result.getValue(2); 5666 5667 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5668 // shouldn't be necessary except that RFP cannot be live across 5669 // multiple blocks. When stackifier is fixed, they can be uncoupled. 5670 MachineFunction &MF = DAG.getMachineFunction(); 5671 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5672 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5673 Tys = DAG.getVTList(MVT::Other); 5674 SDValue Ops[] = { 5675 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 5676 }; 5677 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 5678 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5679 PseudoSourceValue::getFixedStack(SSFI), 0, 5680 false, false, 0); 5681 } 5682 5683 return Result; 5684} 5685 5686// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5687SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 5688 SelectionDAG &DAG) const { 5689 // This algorithm is not obvious. Here it is in C code, more or less: 5690 /* 5691 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5692 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5693 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5694 5695 // Copy ints to xmm registers. 5696 __m128i xh = _mm_cvtsi32_si128( hi ); 5697 __m128i xl = _mm_cvtsi32_si128( lo ); 5698 5699 // Combine into low half of a single xmm register. 5700 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5701 __m128d d; 5702 double sd; 5703 5704 // Merge in appropriate exponents to give the integer bits the right 5705 // magnitude. 5706 x = _mm_unpacklo_epi32( x, exp ); 5707 5708 // Subtract away the biases to deal with the IEEE-754 double precision 5709 // implicit 1. 5710 d = _mm_sub_pd( (__m128d) x, bias ); 5711 5712 // All conversions up to here are exact. The correctly rounded result is 5713 // calculated using the current rounding mode using the following 5714 // horizontal add. 
5715 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5716 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5717 // store doesn't really need to be here (except 5718 // maybe to zero the other double) 5719 return sd; 5720 } 5721 */ 5722 5723 DebugLoc dl = Op.getDebugLoc(); 5724 LLVMContext *Context = DAG.getContext(); 5725 5726 // Build some magic constants. 5727 std::vector<Constant*> CV0; 5728 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5729 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5730 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5731 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5732 Constant *C0 = ConstantVector::get(CV0); 5733 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5734 5735 std::vector<Constant*> CV1; 5736 CV1.push_back( 5737 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5738 CV1.push_back( 5739 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5740 Constant *C1 = ConstantVector::get(CV1); 5741 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5742 5743 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5744 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5745 Op.getOperand(0), 5746 DAG.getIntPtrConstant(1))); 5747 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5748 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5749 Op.getOperand(0), 5750 DAG.getIntPtrConstant(0))); 5751 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5752 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5753 PseudoSourceValue::getConstantPool(), 0, 5754 false, false, 16); 5755 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5756 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5757 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5758 PseudoSourceValue::getConstantPool(), 0, 5759 false, false, 16); 5760 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5761 5762 // Add the halves; easiest way is to swap them into another reg first. 5763 int ShufMask[2] = { 1, -1 }; 5764 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5765 DAG.getUNDEF(MVT::v2f64), ShufMask); 5766 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5767 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5768 DAG.getIntPtrConstant(0)); 5769} 5770 5771// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5772SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 5773 SelectionDAG &DAG) const { 5774 DebugLoc dl = Op.getDebugLoc(); 5775 // FP constant to bias correct the final result. 5776 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5777 MVT::f64); 5778 5779 // Load the 32-bit value into an XMM register. 5780 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5781 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5782 Op.getOperand(0), 5783 DAG.getIntPtrConstant(0))); 5784 5785 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5786 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5787 DAG.getIntPtrConstant(0)); 5788 5789 // Or the load with the bias. 
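  // (The bias is the double 2^52, bit pattern 0x4330000000000000. OR'ing the
  // zero-extended 32-bit value into its low mantissa bits produces the exact
  // double 2^52 + x, and the FSUB of the bias below leaves exactly (double)x.
  // Roughly, in C:
  //   uint64_t bits = 0x4330000000000000ULL | x;   // 2^52 + x, exactly
  //   double d;  memcpy(&d, &bits, sizeof(d));
  //   result = d - 0x1.0p52;                       // == (double)x
  // )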
5790 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5791 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5792 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5793 MVT::v2f64, Load)), 5794 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5795 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5796 MVT::v2f64, Bias))); 5797 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5798 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5799 DAG.getIntPtrConstant(0)); 5800 5801 // Subtract the bias. 5802 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5803 5804 // Handle final rounding. 5805 EVT DestVT = Op.getValueType(); 5806 5807 if (DestVT.bitsLT(MVT::f64)) { 5808 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5809 DAG.getIntPtrConstant(0)); 5810 } else if (DestVT.bitsGT(MVT::f64)) { 5811 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5812 } 5813 5814 // Handle final rounding. 5815 return Sub; 5816} 5817 5818SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 5819 SelectionDAG &DAG) const { 5820 SDValue N0 = Op.getOperand(0); 5821 DebugLoc dl = Op.getDebugLoc(); 5822 5823 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 5824 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 5825 // the optimization here. 5826 if (DAG.SignBitIsZero(N0)) 5827 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5828 5829 EVT SrcVT = N0.getValueType(); 5830 EVT DstVT = Op.getValueType(); 5831 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 5832 return LowerUINT_TO_FP_i64(Op, DAG); 5833 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 5834 return LowerUINT_TO_FP_i32(Op, DAG); 5835 5836 // Make a 64-bit buffer, and use it to build an FILD. 5837 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 5838 if (SrcVT == MVT::i32) { 5839 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 5840 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 5841 getPointerTy(), StackSlot, WordOff); 5842 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5843 StackSlot, NULL, 0, false, false, 0); 5844 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 5845 OffsetSlot, NULL, 0, false, false, 0); 5846 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 5847 return Fild; 5848 } 5849 5850 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 5851 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5852 StackSlot, NULL, 0, false, false, 0); 5853 // For i64 source, we need to add the appropriate power of 2 if the input 5854 // was negative. This is the same as the optimization in 5855 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 5856 // we must be careful to do the computation in x87 extended precision, not 5857 // in SSE. (The generic code can't know it's OK to do this, or how to.) 5858 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 5859 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 5860 SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3); 5861 5862 APInt FF(32, 0x5F800000ULL); 5863 5864 // Check whether the sign bit is set. 5865 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 5866 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 5867 ISD::SETLT); 5868 5869 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 
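  // (0x5F800000 is the single-precision encoding of 2^64. If the i64 input has
  // its sign bit set, FILD has interpreted it as a signed value that is exactly
  // 2^64 too small, so the sign-dependent constant-pool offset below selects
  // 2^64 rather than 0.0f as the fudge factor to add back.)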
5870 SDValue FudgePtr = DAG.getConstantPool( 5871 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 5872 getPointerTy()); 5873 5874 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 5875 SDValue Zero = DAG.getIntPtrConstant(0); 5876 SDValue Four = DAG.getIntPtrConstant(4); 5877 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 5878 Zero, Four); 5879 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 5880 5881 // Load the value out, extending it from f32 to f80. 5882 // FIXME: Avoid the extend by constructing the right constant pool? 5883 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 5884 FudgePtr, PseudoSourceValue::getConstantPool(), 5885 0, MVT::f32, false, false, 4); 5886 // Extend everything to 80 bits to force it to be done on x87. 5887 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 5888 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 5889} 5890 5891std::pair<SDValue,SDValue> X86TargetLowering:: 5892FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 5893 DebugLoc dl = Op.getDebugLoc(); 5894 5895 EVT DstTy = Op.getValueType(); 5896 5897 if (!IsSigned) { 5898 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5899 DstTy = MVT::i64; 5900 } 5901 5902 assert(DstTy.getSimpleVT() <= MVT::i64 && 5903 DstTy.getSimpleVT() >= MVT::i16 && 5904 "Unknown FP_TO_SINT to lower!"); 5905 5906 // These are really Legal. 5907 if (DstTy == MVT::i32 && 5908 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5909 return std::make_pair(SDValue(), SDValue()); 5910 if (Subtarget->is64Bit() && 5911 DstTy == MVT::i64 && 5912 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5913 return std::make_pair(SDValue(), SDValue()); 5914 5915 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5916 // stack slot. 
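  // (FISTP rounds according to the x87 control word, while FP_TO_SINT must
  // truncate, so the FP_TO_INT*_IN_MEM pseudo built here is expanded later with
  // a save/set-to-truncate/restore of the control word around the store. An SSE
  // source is first spilled and reloaded onto the x87 stack via FLD below.)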
5917 MachineFunction &MF = DAG.getMachineFunction(); 5918 unsigned MemSize = DstTy.getSizeInBits()/8; 5919 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5920 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5921 5922 unsigned Opc; 5923 switch (DstTy.getSimpleVT().SimpleTy) { 5924 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5925 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5926 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5927 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5928 } 5929 5930 SDValue Chain = DAG.getEntryNode(); 5931 SDValue Value = Op.getOperand(0); 5932 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5933 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5934 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5935 PseudoSourceValue::getFixedStack(SSFI), 0, 5936 false, false, 0); 5937 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5938 SDValue Ops[] = { 5939 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5940 }; 5941 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5942 Chain = Value.getValue(1); 5943 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5944 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5945 } 5946 5947 // Build the FP_TO_INT*_IN_MEM 5948 SDValue Ops[] = { Chain, Value, StackSlot }; 5949 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5950 5951 return std::make_pair(FIST, StackSlot); 5952} 5953 5954SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 5955 SelectionDAG &DAG) const { 5956 if (Op.getValueType().isVector()) { 5957 if (Op.getValueType() == MVT::v2i32 && 5958 Op.getOperand(0).getValueType() == MVT::v2f64) { 5959 return Op; 5960 } 5961 return SDValue(); 5962 } 5963 5964 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5965 SDValue FIST = Vals.first, StackSlot = Vals.second; 5966 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5967 if (FIST.getNode() == 0) return Op; 5968 5969 // Load the result. 5970 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5971 FIST, StackSlot, NULL, 0, false, false, 0); 5972} 5973 5974SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 5975 SelectionDAG &DAG) const { 5976 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5977 SDValue FIST = Vals.first, StackSlot = Vals.second; 5978 assert(FIST.getNode() && "Unexpected failure"); 5979 5980 // Load the result. 
5981 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5982 FIST, StackSlot, NULL, 0, false, false, 0); 5983} 5984 5985SDValue X86TargetLowering::LowerFABS(SDValue Op, 5986 SelectionDAG &DAG) const { 5987 LLVMContext *Context = DAG.getContext(); 5988 DebugLoc dl = Op.getDebugLoc(); 5989 EVT VT = Op.getValueType(); 5990 EVT EltVT = VT; 5991 if (VT.isVector()) 5992 EltVT = VT.getVectorElementType(); 5993 std::vector<Constant*> CV; 5994 if (EltVT == MVT::f64) { 5995 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5996 CV.push_back(C); 5997 CV.push_back(C); 5998 } else { 5999 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 6000 CV.push_back(C); 6001 CV.push_back(C); 6002 CV.push_back(C); 6003 CV.push_back(C); 6004 } 6005 Constant *C = ConstantVector::get(CV); 6006 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6007 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6008 PseudoSourceValue::getConstantPool(), 0, 6009 false, false, 16); 6010 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 6011} 6012 6013SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 6014 LLVMContext *Context = DAG.getContext(); 6015 DebugLoc dl = Op.getDebugLoc(); 6016 EVT VT = Op.getValueType(); 6017 EVT EltVT = VT; 6018 if (VT.isVector()) 6019 EltVT = VT.getVectorElementType(); 6020 std::vector<Constant*> CV; 6021 if (EltVT == MVT::f64) { 6022 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 6023 CV.push_back(C); 6024 CV.push_back(C); 6025 } else { 6026 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 6027 CV.push_back(C); 6028 CV.push_back(C); 6029 CV.push_back(C); 6030 CV.push_back(C); 6031 } 6032 Constant *C = ConstantVector::get(CV); 6033 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6034 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6035 PseudoSourceValue::getConstantPool(), 0, 6036 false, false, 16); 6037 if (VT.isVector()) { 6038 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6039 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6040 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6041 Op.getOperand(0)), 6042 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 6043 } else { 6044 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6045 } 6046} 6047 6048SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6049 LLVMContext *Context = DAG.getContext(); 6050 SDValue Op0 = Op.getOperand(0); 6051 SDValue Op1 = Op.getOperand(1); 6052 DebugLoc dl = Op.getDebugLoc(); 6053 EVT VT = Op.getValueType(); 6054 EVT SrcVT = Op1.getValueType(); 6055 6056 // If second operand is smaller, extend it first. 6057 if (SrcVT.bitsLT(VT)) { 6058 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6059 SrcVT = VT; 6060 } 6061 // And if it is bigger, shrink it first. 6062 if (SrcVT.bitsGT(VT)) { 6063 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6064 SrcVT = VT; 6065 } 6066 6067 // At this point the operands and the result should have the same 6068 // type, and that won't be f80 since that is not custom lowered. 6069 6070 // First get the sign bit of second operand. 
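  // For the f64 case the sign mask built below has only bit 63 set
  // (0x8000000000000000) and the clear mask used later is its complement
  // (0x7fffffffffffffff).  Roughly, copysign(x, y) is computed here as
  // (x & ~signmask) | (y & signmask) using FAND/FOR on those constants.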
6071 std::vector<Constant*> CV; 6072 if (SrcVT == MVT::f64) { 6073 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6074 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6075 } else { 6076 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6077 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6078 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6079 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6080 } 6081 Constant *C = ConstantVector::get(CV); 6082 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6083 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6084 PseudoSourceValue::getConstantPool(), 0, 6085 false, false, 16); 6086 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6087 6088 // Shift sign bit right or left if the two operands have different types. 6089 if (SrcVT.bitsGT(VT)) { 6090 // Op0 is MVT::f32, Op1 is MVT::f64. 6091 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6092 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6093 DAG.getConstant(32, MVT::i32)); 6094 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 6095 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6096 DAG.getIntPtrConstant(0)); 6097 } 6098 6099 // Clear first operand sign bit. 6100 CV.clear(); 6101 if (VT == MVT::f64) { 6102 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6103 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6104 } else { 6105 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6106 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6107 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6108 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6109 } 6110 C = ConstantVector::get(CV); 6111 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6112 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6113 PseudoSourceValue::getConstantPool(), 0, 6114 false, false, 16); 6115 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6116 6117 // Or the value with the sign bit. 6118 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6119} 6120 6121/// Emit nodes that will be selected as "test Op0,Op0", or something 6122/// equivalent. 6123SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6124 SelectionDAG &DAG) const { 6125 DebugLoc dl = Op.getDebugLoc(); 6126 6127 // CF and OF aren't always set the way we want. Determine which 6128 // of these we need. 6129 bool NeedCF = false; 6130 bool NeedOF = false; 6131 switch (X86CC) { 6132 default: break; 6133 case X86::COND_A: case X86::COND_AE: 6134 case X86::COND_B: case X86::COND_BE: 6135 NeedCF = true; 6136 break; 6137 case X86::COND_G: case X86::COND_GE: 6138 case X86::COND_L: case X86::COND_LE: 6139 case X86::COND_O: case X86::COND_NO: 6140 NeedOF = true; 6141 break; 6142 } 6143 6144 // See if we can use the EFLAGS value from the operand instead of 6145 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 6146 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6147 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6148 // Emit a CMP with 0, which is the TEST pattern. 
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  unsigned Opcode = 0;
  unsigned NumOperands = 0;
  switch (Op.getNode()->getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output.  Alas, even if none of our users are stores,
    // that doesn't prove we're O.K.  Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
        goto default_case;

    if (ConstantSDNode *C =
          dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->getAPIntValue() == 1) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->getAPIntValue().isAllOnesValue()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::AND: {
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    bool NonFlagUse = false;
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
      SDNode *User = *UI;
      unsigned UOpNo = UI.getOperandNo();
      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
        // Look past the truncate.
        UOpNo = User->use_begin().getOperandNo();
        User = *User->use_begin();
      }

      if (User->getOpcode() != ISD::BRCOND &&
          User->getOpcode() != ISD::SETCC &&
          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
        NonFlagUse = true;
        break;
      }
    }

    if (!NonFlagUse)
      break;
  }
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
6229 switch (Op.getNode()->getOpcode()) { 6230 default: llvm_unreachable("unexpected operator!"); 6231 case ISD::SUB: Opcode = X86ISD::SUB; break; 6232 case ISD::OR: Opcode = X86ISD::OR; break; 6233 case ISD::XOR: Opcode = X86ISD::XOR; break; 6234 case ISD::AND: Opcode = X86ISD::AND; break; 6235 } 6236 6237 NumOperands = 2; 6238 break; 6239 case X86ISD::ADD: 6240 case X86ISD::SUB: 6241 case X86ISD::INC: 6242 case X86ISD::DEC: 6243 case X86ISD::OR: 6244 case X86ISD::XOR: 6245 case X86ISD::AND: 6246 return SDValue(Op.getNode(), 1); 6247 default: 6248 default_case: 6249 break; 6250 } 6251 6252 if (Opcode == 0) 6253 // Emit a CMP with 0, which is the TEST pattern. 6254 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6255 DAG.getConstant(0, Op.getValueType())); 6256 6257 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6258 SmallVector<SDValue, 4> Ops; 6259 for (unsigned i = 0; i != NumOperands; ++i) 6260 Ops.push_back(Op.getOperand(i)); 6261 6262 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6263 DAG.ReplaceAllUsesWith(Op, New); 6264 return SDValue(New.getNode(), 1); 6265} 6266 6267/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6268/// equivalent. 6269SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6270 SelectionDAG &DAG) const { 6271 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6272 if (C->getAPIntValue() == 0) 6273 return EmitTest(Op0, X86CC, DAG); 6274 6275 DebugLoc dl = Op0.getDebugLoc(); 6276 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6277} 6278 6279/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6280/// if it's possible. 6281SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6282 DebugLoc dl, SelectionDAG &DAG) const { 6283 SDValue Op0 = And.getOperand(0); 6284 SDValue Op1 = And.getOperand(1); 6285 if (Op0.getOpcode() == ISD::TRUNCATE) 6286 Op0 = Op0.getOperand(0); 6287 if (Op1.getOpcode() == ISD::TRUNCATE) 6288 Op1 = Op1.getOperand(0); 6289 6290 SDValue LHS, RHS; 6291 if (Op1.getOpcode() == ISD::SHL) 6292 std::swap(Op0, Op1); 6293 if (Op0.getOpcode() == ISD::SHL) { 6294 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6295 if (And00C->getZExtValue() == 1) { 6296 // If we looked past a truncate, check that it's only truncating away 6297 // known zeros. 6298 unsigned BitWidth = Op0.getValueSizeInBits(); 6299 unsigned AndBitWidth = And.getValueSizeInBits(); 6300 if (BitWidth > AndBitWidth) { 6301 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 6302 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 6303 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 6304 return SDValue(); 6305 } 6306 LHS = Op1; 6307 RHS = Op0.getOperand(1); 6308 } 6309 } else if (Op1.getOpcode() == ISD::Constant) { 6310 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6311 SDValue AndLHS = Op0; 6312 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6313 LHS = AndLHS.getOperand(0); 6314 RHS = AndLHS.getOperand(1); 6315 } 6316 } 6317 6318 if (LHS.getNode()) { 6319 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 6320 // instruction. Since the shift amount is in-range-or-undefined, we know 6321 // that doing a bittest on the i32 value is ok. We extend to i32 because 6322 // the encoding for the i16 version is larger than the i32 version. 6323 // Also promote i16 to i32 for performance / code size reason. 
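    // Illustrative example: for an i32 value X, "(X & (1 << N)) != 0" ends
    // up as an X86ISD::BT of X and N whose carry flag is consumed by the
    // SETCC below (COND_B for SETNE, COND_AE for SETEQ).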
6324 if (LHS.getValueType() == MVT::i8 || 6325 LHS.getValueType() == MVT::i16) 6326 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 6327 6328 // If the operand types disagree, extend the shift amount to match. Since 6329 // BT ignores high bits (like shifts) we can use anyextend. 6330 if (LHS.getValueType() != RHS.getValueType()) 6331 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 6332 6333 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 6334 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 6335 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6336 DAG.getConstant(Cond, MVT::i8), BT); 6337 } 6338 6339 return SDValue(); 6340} 6341 6342SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 6343 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 6344 SDValue Op0 = Op.getOperand(0); 6345 SDValue Op1 = Op.getOperand(1); 6346 DebugLoc dl = Op.getDebugLoc(); 6347 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 6348 6349 // Optimize to BT if possible. 6350 // Lower (X & (1 << N)) == 0 to BT(X, N). 6351 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 6352 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 6353 if (Op0.getOpcode() == ISD::AND && 6354 Op0.hasOneUse() && 6355 Op1.getOpcode() == ISD::Constant && 6356 cast<ConstantSDNode>(Op1)->isNullValue() && 6357 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6358 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 6359 if (NewSetCC.getNode()) 6360 return NewSetCC; 6361 } 6362 6363 // Look for "(setcc) == / != 1" to avoid unncessary setcc. 6364 if (Op0.getOpcode() == X86ISD::SETCC && 6365 Op1.getOpcode() == ISD::Constant && 6366 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 6367 cast<ConstantSDNode>(Op1)->isNullValue()) && 6368 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6369 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 6370 bool Invert = (CC == ISD::SETNE) ^ 6371 cast<ConstantSDNode>(Op1)->isNullValue(); 6372 if (Invert) 6373 CCode = X86::GetOppositeBranchCondition(CCode); 6374 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6375 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 6376 } 6377 6378 bool isFP = Op1.getValueType().isFloatingPoint(); 6379 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 6380 if (X86CC == X86::COND_INVALID) 6381 return SDValue(); 6382 6383 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 6384 6385 // Use sbb x, x to materialize carry bit into a GPR. 6386 if (X86CC == X86::COND_B) 6387 return DAG.getNode(ISD::AND, dl, MVT::i8, 6388 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, 6389 DAG.getConstant(X86CC, MVT::i8), Cond), 6390 DAG.getConstant(1, MVT::i8)); 6391 6392 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6393 DAG.getConstant(X86CC, MVT::i8), Cond); 6394} 6395 6396SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 6397 SDValue Cond; 6398 SDValue Op0 = Op.getOperand(0); 6399 SDValue Op1 = Op.getOperand(1); 6400 SDValue CC = Op.getOperand(2); 6401 EVT VT = Op.getValueType(); 6402 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6403 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 6404 DebugLoc dl = Op.getDebugLoc(); 6405 6406 if (isFP) { 6407 unsigned SSECC = 8; 6408 EVT VT0 = Op0.getValueType(); 6409 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 6410 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 6411 bool Swap = false; 6412 6413 switch (SetCCOpcode) { 6414 default: break; 6415 case ISD::SETOEQ: 6416 case ISD::SETEQ: SSECC = 0; break; 6417 case ISD::SETOGT: 6418 case ISD::SETGT: Swap = true; // Fallthrough 6419 case ISD::SETLT: 6420 case ISD::SETOLT: SSECC = 1; break; 6421 case ISD::SETOGE: 6422 case ISD::SETGE: Swap = true; // Fallthrough 6423 case ISD::SETLE: 6424 case ISD::SETOLE: SSECC = 2; break; 6425 case ISD::SETUO: SSECC = 3; break; 6426 case ISD::SETUNE: 6427 case ISD::SETNE: SSECC = 4; break; 6428 case ISD::SETULE: Swap = true; 6429 case ISD::SETUGE: SSECC = 5; break; 6430 case ISD::SETULT: Swap = true; 6431 case ISD::SETUGT: SSECC = 6; break; 6432 case ISD::SETO: SSECC = 7; break; 6433 } 6434 if (Swap) 6435 std::swap(Op0, Op1); 6436 6437 // In the two special cases we can't handle, emit two comparisons. 6438 if (SSECC == 8) { 6439 if (SetCCOpcode == ISD::SETUEQ) { 6440 SDValue UNORD, EQ; 6441 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6442 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6443 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6444 } 6445 else if (SetCCOpcode == ISD::SETONE) { 6446 SDValue ORD, NEQ; 6447 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6448 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6449 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6450 } 6451 llvm_unreachable("Illegal FP comparison"); 6452 } 6453 // Handle all other FP comparisons here. 6454 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6455 } 6456 6457 // We are handling one of the integer comparisons here. Since SSE only has 6458 // GT and EQ comparisons for integer, swapping operands and multiple 6459 // operations may be required for some comparisons. 6460 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6461 bool Swap = false, Invert = false, FlipSigns = false; 6462 6463 switch (VT.getSimpleVT().SimpleTy) { 6464 default: break; 6465 case MVT::v8i8: 6466 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6467 case MVT::v4i16: 6468 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6469 case MVT::v2i32: 6470 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6471 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6472 } 6473 6474 switch (SetCCOpcode) { 6475 default: break; 6476 case ISD::SETNE: Invert = true; 6477 case ISD::SETEQ: Opc = EQOpc; break; 6478 case ISD::SETLT: Swap = true; 6479 case ISD::SETGT: Opc = GTOpc; break; 6480 case ISD::SETGE: Swap = true; 6481 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6482 case ISD::SETULT: Swap = true; 6483 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6484 case ISD::SETUGE: Swap = true; 6485 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6486 } 6487 if (Swap) 6488 std::swap(Op0, Op1); 6489 6490 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6491 // bits of the inputs before performing those operations. 
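  // Illustrative example with i8 elements: 200u > 100u cannot be given to
  // the signed PCMPGTB directly, but after XORing both sides with 0x80 the
  // operands read as 72 and -28, and the signed compare 72 > -28 gives the
  // same answer as the unsigned comparison did.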
6492 if (FlipSigns) { 6493 EVT EltVT = VT.getVectorElementType(); 6494 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 6495 EltVT); 6496 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 6497 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 6498 SignBits.size()); 6499 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 6500 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 6501 } 6502 6503 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 6504 6505 // If the logical-not of the result is required, perform that now. 6506 if (Invert) 6507 Result = DAG.getNOT(dl, Result, VT); 6508 6509 return Result; 6510} 6511 6512// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 6513static bool isX86LogicalCmp(SDValue Op) { 6514 unsigned Opc = Op.getNode()->getOpcode(); 6515 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 6516 return true; 6517 if (Op.getResNo() == 1 && 6518 (Opc == X86ISD::ADD || 6519 Opc == X86ISD::SUB || 6520 Opc == X86ISD::SMUL || 6521 Opc == X86ISD::UMUL || 6522 Opc == X86ISD::INC || 6523 Opc == X86ISD::DEC || 6524 Opc == X86ISD::OR || 6525 Opc == X86ISD::XOR || 6526 Opc == X86ISD::AND)) 6527 return true; 6528 6529 return false; 6530} 6531 6532SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 6533 bool addTest = true; 6534 SDValue Cond = Op.getOperand(0); 6535 DebugLoc dl = Op.getDebugLoc(); 6536 SDValue CC; 6537 6538 if (Cond.getOpcode() == ISD::SETCC) { 6539 SDValue NewCond = LowerSETCC(Cond, DAG); 6540 if (NewCond.getNode()) 6541 Cond = NewCond; 6542 } 6543 6544 // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) 6545 SDValue Op1 = Op.getOperand(1); 6546 SDValue Op2 = Op.getOperand(2); 6547 if (Cond.getOpcode() == X86ISD::SETCC && 6548 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { 6549 SDValue Cmp = Cond.getOperand(1); 6550 if (Cmp.getOpcode() == X86ISD::CMP) { 6551 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); 6552 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 6553 ConstantSDNode *RHSC = 6554 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); 6555 if (N1C && N1C->isAllOnesValue() && 6556 N2C && N2C->isNullValue() && 6557 RHSC && RHSC->isNullValue()) { 6558 SDValue CmpOp0 = Cmp.getOperand(0); 6559 Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6560 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 6561 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), 6562 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 6563 } 6564 } 6565 } 6566 6567 // Look pass (and (setcc_carry (cmp ...)), 1). 6568 if (Cond.getOpcode() == ISD::AND && 6569 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6570 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6571 if (C && C->getAPIntValue() == 1) 6572 Cond = Cond.getOperand(0); 6573 } 6574 6575 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6576 // setting operand in place of the X86ISD::SETCC. 6577 if (Cond.getOpcode() == X86ISD::SETCC || 6578 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6579 CC = Cond.getOperand(0); 6580 6581 SDValue Cmp = Cond.getOperand(1); 6582 unsigned Opc = Cmp.getOpcode(); 6583 EVT VT = Op.getValueType(); 6584 6585 bool IllegalFPCMov = false; 6586 if (VT.isFloatingPoint() && !VT.isVector() && 6587 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
6588 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 6589 6590 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 6591 Opc == X86ISD::BT) { // FIXME 6592 Cond = Cmp; 6593 addTest = false; 6594 } 6595 } 6596 6597 if (addTest) { 6598 // Look pass the truncate. 6599 if (Cond.getOpcode() == ISD::TRUNCATE) 6600 Cond = Cond.getOperand(0); 6601 6602 // We know the result of AND is compared against zero. Try to match 6603 // it to BT. 6604 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6605 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6606 if (NewSetCC.getNode()) { 6607 CC = NewSetCC.getOperand(0); 6608 Cond = NewSetCC.getOperand(1); 6609 addTest = false; 6610 } 6611 } 6612 } 6613 6614 if (addTest) { 6615 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6616 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6617 } 6618 6619 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 6620 // condition is true. 6621 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 6622 SDValue Ops[] = { Op2, Op1, CC, Cond }; 6623 return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops)); 6624} 6625 6626// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 6627// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 6628// from the AND / OR. 6629static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 6630 Opc = Op.getOpcode(); 6631 if (Opc != ISD::OR && Opc != ISD::AND) 6632 return false; 6633 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6634 Op.getOperand(0).hasOneUse() && 6635 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 6636 Op.getOperand(1).hasOneUse()); 6637} 6638 6639// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 6640// 1 and that the SETCC node has a single use. 6641static bool isXor1OfSetCC(SDValue Op) { 6642 if (Op.getOpcode() != ISD::XOR) 6643 return false; 6644 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6645 if (N1C && N1C->getAPIntValue() == 1) { 6646 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6647 Op.getOperand(0).hasOneUse(); 6648 } 6649 return false; 6650} 6651 6652SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 6653 bool addTest = true; 6654 SDValue Chain = Op.getOperand(0); 6655 SDValue Cond = Op.getOperand(1); 6656 SDValue Dest = Op.getOperand(2); 6657 DebugLoc dl = Op.getDebugLoc(); 6658 SDValue CC; 6659 6660 if (Cond.getOpcode() == ISD::SETCC) { 6661 SDValue NewCond = LowerSETCC(Cond, DAG); 6662 if (NewCond.getNode()) 6663 Cond = NewCond; 6664 } 6665#if 0 6666 // FIXME: LowerXALUO doesn't handle these!! 6667 else if (Cond.getOpcode() == X86ISD::ADD || 6668 Cond.getOpcode() == X86ISD::SUB || 6669 Cond.getOpcode() == X86ISD::SMUL || 6670 Cond.getOpcode() == X86ISD::UMUL) 6671 Cond = LowerXALUO(Cond, DAG); 6672#endif 6673 6674 // Look pass (and (setcc_carry (cmp ...)), 1). 6675 if (Cond.getOpcode() == ISD::AND && 6676 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6677 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6678 if (C && C->getAPIntValue() == 1) 6679 Cond = Cond.getOperand(0); 6680 } 6681 6682 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6683 // setting operand in place of the X86ISD::SETCC. 
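  // E.g. (illustrative) "brcond (setcc eq, (cmp a, b)), dest" can branch on
  // the flags of the existing CMP directly ("cmp a, b; je dest") instead of
  // materializing the setcc result and testing it again.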
6684 if (Cond.getOpcode() == X86ISD::SETCC || 6685 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6686 CC = Cond.getOperand(0); 6687 6688 SDValue Cmp = Cond.getOperand(1); 6689 unsigned Opc = Cmp.getOpcode(); 6690 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 6691 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 6692 Cond = Cmp; 6693 addTest = false; 6694 } else { 6695 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 6696 default: break; 6697 case X86::COND_O: 6698 case X86::COND_B: 6699 // These can only come from an arithmetic instruction with overflow, 6700 // e.g. SADDO, UADDO. 6701 Cond = Cond.getNode()->getOperand(1); 6702 addTest = false; 6703 break; 6704 } 6705 } 6706 } else { 6707 unsigned CondOpc; 6708 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 6709 SDValue Cmp = Cond.getOperand(0).getOperand(1); 6710 if (CondOpc == ISD::OR) { 6711 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 6712 // two branches instead of an explicit OR instruction with a 6713 // separate test. 6714 if (Cmp == Cond.getOperand(1).getOperand(1) && 6715 isX86LogicalCmp(Cmp)) { 6716 CC = Cond.getOperand(0).getOperand(0); 6717 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6718 Chain, Dest, CC, Cmp); 6719 CC = Cond.getOperand(1).getOperand(0); 6720 Cond = Cmp; 6721 addTest = false; 6722 } 6723 } else { // ISD::AND 6724 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 6725 // two branches instead of an explicit AND instruction with a 6726 // separate test. However, we only do this if this block doesn't 6727 // have a fall-through edge, because this requires an explicit 6728 // jmp when the condition is false. 6729 if (Cmp == Cond.getOperand(1).getOperand(1) && 6730 isX86LogicalCmp(Cmp) && 6731 Op.getNode()->hasOneUse()) { 6732 X86::CondCode CCode = 6733 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6734 CCode = X86::GetOppositeBranchCondition(CCode); 6735 CC = DAG.getConstant(CCode, MVT::i8); 6736 SDNode *User = *Op.getNode()->use_begin(); 6737 // Look for an unconditional branch following this conditional branch. 6738 // We need this because we need to reverse the successors in order 6739 // to implement FCMP_OEQ. 6740 if (User->getOpcode() == ISD::BR) { 6741 SDValue FalseBB = User->getOperand(1); 6742 SDNode *NewBR = 6743 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 6744 assert(NewBR == User); 6745 (void)NewBR; 6746 Dest = FalseBB; 6747 6748 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6749 Chain, Dest, CC, Cmp); 6750 X86::CondCode CCode = 6751 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 6752 CCode = X86::GetOppositeBranchCondition(CCode); 6753 CC = DAG.getConstant(CCode, MVT::i8); 6754 Cond = Cmp; 6755 addTest = false; 6756 } 6757 } 6758 } 6759 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 6760 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 6761 // It should be transformed during dag combiner except when the condition 6762 // is set by a arithmetics with overflow node. 6763 X86::CondCode CCode = 6764 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6765 CCode = X86::GetOppositeBranchCondition(CCode); 6766 CC = DAG.getConstant(CCode, MVT::i8); 6767 Cond = Cond.getOperand(0).getOperand(1); 6768 addTest = false; 6769 } 6770 } 6771 6772 if (addTest) { 6773 // Look pass the truncate. 6774 if (Cond.getOpcode() == ISD::TRUNCATE) 6775 Cond = Cond.getOperand(0); 6776 6777 // We know the result of AND is compared against zero. 
Try to match 6778 // it to BT. 6779 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6780 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6781 if (NewSetCC.getNode()) { 6782 CC = NewSetCC.getOperand(0); 6783 Cond = NewSetCC.getOperand(1); 6784 addTest = false; 6785 } 6786 } 6787 } 6788 6789 if (addTest) { 6790 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6791 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6792 } 6793 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6794 Chain, Dest, CC, Cond); 6795} 6796 6797 6798// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 6799// Calls to _alloca is needed to probe the stack when allocating more than 4k 6800// bytes in one go. Touching the stack at 4K increments is necessary to ensure 6801// that the guard pages used by the OS virtual memory manager are allocated in 6802// correct sequence. 6803SDValue 6804X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 6805 SelectionDAG &DAG) const { 6806 assert(Subtarget->isTargetCygMing() && 6807 "This should be used only on Cygwin/Mingw targets"); 6808 DebugLoc dl = Op.getDebugLoc(); 6809 6810 // Get the inputs. 6811 SDValue Chain = Op.getOperand(0); 6812 SDValue Size = Op.getOperand(1); 6813 // FIXME: Ensure alignment here 6814 6815 SDValue Flag; 6816 6817 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 6818 6819 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 6820 Flag = Chain.getValue(1); 6821 6822 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 6823 6824 Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag); 6825 Flag = Chain.getValue(1); 6826 6827 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 6828 6829 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 6830 return DAG.getMergeValues(Ops1, 2, dl); 6831} 6832 6833SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 6834 MachineFunction &MF = DAG.getMachineFunction(); 6835 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 6836 6837 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6838 DebugLoc dl = Op.getDebugLoc(); 6839 6840 if (!Subtarget->is64Bit()) { 6841 // vastart just stores the address of the VarArgsFrameIndex slot into the 6842 // memory location argument. 6843 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6844 getPointerTy()); 6845 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, 6846 false, false, 0); 6847 } 6848 6849 // __va_list_tag: 6850 // gp_offset (0 - 6 * 8) 6851 // fp_offset (48 - 48 + 8 * 16) 6852 // overflow_arg_area (point to parameters coming in memory). 
6853 // reg_save_area 6854 SmallVector<SDValue, 8> MemOps; 6855 SDValue FIN = Op.getOperand(1); 6856 // Store gp_offset 6857 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6858 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 6859 MVT::i32), 6860 FIN, SV, 0, false, false, 0); 6861 MemOps.push_back(Store); 6862 6863 // Store fp_offset 6864 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6865 FIN, DAG.getIntPtrConstant(4)); 6866 Store = DAG.getStore(Op.getOperand(0), dl, 6867 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 6868 MVT::i32), 6869 FIN, SV, 4, false, false, 0); 6870 MemOps.push_back(Store); 6871 6872 // Store ptr to overflow_arg_area 6873 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6874 FIN, DAG.getIntPtrConstant(4)); 6875 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6876 getPointerTy()); 6877 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8, 6878 false, false, 0); 6879 MemOps.push_back(Store); 6880 6881 // Store ptr to reg_save_area. 6882 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6883 FIN, DAG.getIntPtrConstant(8)); 6884 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 6885 getPointerTy()); 6886 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16, 6887 false, false, 0); 6888 MemOps.push_back(Store); 6889 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6890 &MemOps[0], MemOps.size()); 6891} 6892 6893SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 6894 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6895 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6896 6897 report_fatal_error("VAArgInst is not yet implemented for x86-64!"); 6898 return SDValue(); 6899} 6900 6901SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 6902 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6903 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6904 SDValue Chain = Op.getOperand(0); 6905 SDValue DstPtr = Op.getOperand(1); 6906 SDValue SrcPtr = Op.getOperand(2); 6907 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6908 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6909 DebugLoc dl = Op.getDebugLoc(); 6910 6911 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6912 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 6913 false, DstSV, 0, SrcSV, 0); 6914} 6915 6916SDValue 6917X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 6918 DebugLoc dl = Op.getDebugLoc(); 6919 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6920 switch (IntNo) { 6921 default: return SDValue(); // Don't custom lower most intrinsics. 6922 // Comparison intrinsics. 
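  // E.g. (illustrative) x86_sse_comieq_ss(a, b) is lowered below to an
  // X86ISD::COMI comparison of a and b followed by a SETCC on the resulting
  // EFLAGS, zero-extended to the i32 the intrinsic returns.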
6923 case Intrinsic::x86_sse_comieq_ss: 6924 case Intrinsic::x86_sse_comilt_ss: 6925 case Intrinsic::x86_sse_comile_ss: 6926 case Intrinsic::x86_sse_comigt_ss: 6927 case Intrinsic::x86_sse_comige_ss: 6928 case Intrinsic::x86_sse_comineq_ss: 6929 case Intrinsic::x86_sse_ucomieq_ss: 6930 case Intrinsic::x86_sse_ucomilt_ss: 6931 case Intrinsic::x86_sse_ucomile_ss: 6932 case Intrinsic::x86_sse_ucomigt_ss: 6933 case Intrinsic::x86_sse_ucomige_ss: 6934 case Intrinsic::x86_sse_ucomineq_ss: 6935 case Intrinsic::x86_sse2_comieq_sd: 6936 case Intrinsic::x86_sse2_comilt_sd: 6937 case Intrinsic::x86_sse2_comile_sd: 6938 case Intrinsic::x86_sse2_comigt_sd: 6939 case Intrinsic::x86_sse2_comige_sd: 6940 case Intrinsic::x86_sse2_comineq_sd: 6941 case Intrinsic::x86_sse2_ucomieq_sd: 6942 case Intrinsic::x86_sse2_ucomilt_sd: 6943 case Intrinsic::x86_sse2_ucomile_sd: 6944 case Intrinsic::x86_sse2_ucomigt_sd: 6945 case Intrinsic::x86_sse2_ucomige_sd: 6946 case Intrinsic::x86_sse2_ucomineq_sd: { 6947 unsigned Opc = 0; 6948 ISD::CondCode CC = ISD::SETCC_INVALID; 6949 switch (IntNo) { 6950 default: break; 6951 case Intrinsic::x86_sse_comieq_ss: 6952 case Intrinsic::x86_sse2_comieq_sd: 6953 Opc = X86ISD::COMI; 6954 CC = ISD::SETEQ; 6955 break; 6956 case Intrinsic::x86_sse_comilt_ss: 6957 case Intrinsic::x86_sse2_comilt_sd: 6958 Opc = X86ISD::COMI; 6959 CC = ISD::SETLT; 6960 break; 6961 case Intrinsic::x86_sse_comile_ss: 6962 case Intrinsic::x86_sse2_comile_sd: 6963 Opc = X86ISD::COMI; 6964 CC = ISD::SETLE; 6965 break; 6966 case Intrinsic::x86_sse_comigt_ss: 6967 case Intrinsic::x86_sse2_comigt_sd: 6968 Opc = X86ISD::COMI; 6969 CC = ISD::SETGT; 6970 break; 6971 case Intrinsic::x86_sse_comige_ss: 6972 case Intrinsic::x86_sse2_comige_sd: 6973 Opc = X86ISD::COMI; 6974 CC = ISD::SETGE; 6975 break; 6976 case Intrinsic::x86_sse_comineq_ss: 6977 case Intrinsic::x86_sse2_comineq_sd: 6978 Opc = X86ISD::COMI; 6979 CC = ISD::SETNE; 6980 break; 6981 case Intrinsic::x86_sse_ucomieq_ss: 6982 case Intrinsic::x86_sse2_ucomieq_sd: 6983 Opc = X86ISD::UCOMI; 6984 CC = ISD::SETEQ; 6985 break; 6986 case Intrinsic::x86_sse_ucomilt_ss: 6987 case Intrinsic::x86_sse2_ucomilt_sd: 6988 Opc = X86ISD::UCOMI; 6989 CC = ISD::SETLT; 6990 break; 6991 case Intrinsic::x86_sse_ucomile_ss: 6992 case Intrinsic::x86_sse2_ucomile_sd: 6993 Opc = X86ISD::UCOMI; 6994 CC = ISD::SETLE; 6995 break; 6996 case Intrinsic::x86_sse_ucomigt_ss: 6997 case Intrinsic::x86_sse2_ucomigt_sd: 6998 Opc = X86ISD::UCOMI; 6999 CC = ISD::SETGT; 7000 break; 7001 case Intrinsic::x86_sse_ucomige_ss: 7002 case Intrinsic::x86_sse2_ucomige_sd: 7003 Opc = X86ISD::UCOMI; 7004 CC = ISD::SETGE; 7005 break; 7006 case Intrinsic::x86_sse_ucomineq_ss: 7007 case Intrinsic::x86_sse2_ucomineq_sd: 7008 Opc = X86ISD::UCOMI; 7009 CC = ISD::SETNE; 7010 break; 7011 } 7012 7013 SDValue LHS = Op.getOperand(1); 7014 SDValue RHS = Op.getOperand(2); 7015 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 7016 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 7017 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 7018 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7019 DAG.getConstant(X86CC, MVT::i8), Cond); 7020 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7021 } 7022 // ptest and testp intrinsics. The intrinsic these come from are designed to 7023 // return an integer value, not just an instruction so lower it to the ptest 7024 // or testp pattern and a setcc for the result. 
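  // E.g. (illustrative) x86_sse41_ptestz(a, b) becomes an X86ISD::PTEST of
  // a and b followed by a SETCC on COND_E (ZF is set when (a & b) == 0),
  // zero-extended to the i32 result the intrinsic is defined to return.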
7025 case Intrinsic::x86_sse41_ptestz: 7026 case Intrinsic::x86_sse41_ptestc: 7027 case Intrinsic::x86_sse41_ptestnzc: 7028 case Intrinsic::x86_avx_ptestz_256: 7029 case Intrinsic::x86_avx_ptestc_256: 7030 case Intrinsic::x86_avx_ptestnzc_256: 7031 case Intrinsic::x86_avx_vtestz_ps: 7032 case Intrinsic::x86_avx_vtestc_ps: 7033 case Intrinsic::x86_avx_vtestnzc_ps: 7034 case Intrinsic::x86_avx_vtestz_pd: 7035 case Intrinsic::x86_avx_vtestc_pd: 7036 case Intrinsic::x86_avx_vtestnzc_pd: 7037 case Intrinsic::x86_avx_vtestz_ps_256: 7038 case Intrinsic::x86_avx_vtestc_ps_256: 7039 case Intrinsic::x86_avx_vtestnzc_ps_256: 7040 case Intrinsic::x86_avx_vtestz_pd_256: 7041 case Intrinsic::x86_avx_vtestc_pd_256: 7042 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7043 bool IsTestPacked = false; 7044 unsigned X86CC = 0; 7045 switch (IntNo) { 7046 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7047 case Intrinsic::x86_avx_vtestz_ps: 7048 case Intrinsic::x86_avx_vtestz_pd: 7049 case Intrinsic::x86_avx_vtestz_ps_256: 7050 case Intrinsic::x86_avx_vtestz_pd_256: 7051 IsTestPacked = true; // Fallthrough 7052 case Intrinsic::x86_sse41_ptestz: 7053 case Intrinsic::x86_avx_ptestz_256: 7054 // ZF = 1 7055 X86CC = X86::COND_E; 7056 break; 7057 case Intrinsic::x86_avx_vtestc_ps: 7058 case Intrinsic::x86_avx_vtestc_pd: 7059 case Intrinsic::x86_avx_vtestc_ps_256: 7060 case Intrinsic::x86_avx_vtestc_pd_256: 7061 IsTestPacked = true; // Fallthrough 7062 case Intrinsic::x86_sse41_ptestc: 7063 case Intrinsic::x86_avx_ptestc_256: 7064 // CF = 1 7065 X86CC = X86::COND_B; 7066 break; 7067 case Intrinsic::x86_avx_vtestnzc_ps: 7068 case Intrinsic::x86_avx_vtestnzc_pd: 7069 case Intrinsic::x86_avx_vtestnzc_ps_256: 7070 case Intrinsic::x86_avx_vtestnzc_pd_256: 7071 IsTestPacked = true; // Fallthrough 7072 case Intrinsic::x86_sse41_ptestnzc: 7073 case Intrinsic::x86_avx_ptestnzc_256: 7074 // ZF and CF = 0 7075 X86CC = X86::COND_A; 7076 break; 7077 } 7078 7079 SDValue LHS = Op.getOperand(1); 7080 SDValue RHS = Op.getOperand(2); 7081 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7082 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7083 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7084 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7085 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7086 } 7087 7088 // Fix vector shift instructions where the last operand is a non-immediate 7089 // i32 value. 
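  // E.g. (illustrative) x86_sse2_pslli_d(v, n) with a non-constant n is
  // rewritten below as x86_sse2_psll_d(v, build_vector(n, 0, undef, undef)),
  // since the register form of the shift takes its amount in a vector
  // register rather than as an immediate.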
7090 case Intrinsic::x86_sse2_pslli_w: 7091 case Intrinsic::x86_sse2_pslli_d: 7092 case Intrinsic::x86_sse2_pslli_q: 7093 case Intrinsic::x86_sse2_psrli_w: 7094 case Intrinsic::x86_sse2_psrli_d: 7095 case Intrinsic::x86_sse2_psrli_q: 7096 case Intrinsic::x86_sse2_psrai_w: 7097 case Intrinsic::x86_sse2_psrai_d: 7098 case Intrinsic::x86_mmx_pslli_w: 7099 case Intrinsic::x86_mmx_pslli_d: 7100 case Intrinsic::x86_mmx_pslli_q: 7101 case Intrinsic::x86_mmx_psrli_w: 7102 case Intrinsic::x86_mmx_psrli_d: 7103 case Intrinsic::x86_mmx_psrli_q: 7104 case Intrinsic::x86_mmx_psrai_w: 7105 case Intrinsic::x86_mmx_psrai_d: { 7106 SDValue ShAmt = Op.getOperand(2); 7107 if (isa<ConstantSDNode>(ShAmt)) 7108 return SDValue(); 7109 7110 unsigned NewIntNo = 0; 7111 EVT ShAmtVT = MVT::v4i32; 7112 switch (IntNo) { 7113 case Intrinsic::x86_sse2_pslli_w: 7114 NewIntNo = Intrinsic::x86_sse2_psll_w; 7115 break; 7116 case Intrinsic::x86_sse2_pslli_d: 7117 NewIntNo = Intrinsic::x86_sse2_psll_d; 7118 break; 7119 case Intrinsic::x86_sse2_pslli_q: 7120 NewIntNo = Intrinsic::x86_sse2_psll_q; 7121 break; 7122 case Intrinsic::x86_sse2_psrli_w: 7123 NewIntNo = Intrinsic::x86_sse2_psrl_w; 7124 break; 7125 case Intrinsic::x86_sse2_psrli_d: 7126 NewIntNo = Intrinsic::x86_sse2_psrl_d; 7127 break; 7128 case Intrinsic::x86_sse2_psrli_q: 7129 NewIntNo = Intrinsic::x86_sse2_psrl_q; 7130 break; 7131 case Intrinsic::x86_sse2_psrai_w: 7132 NewIntNo = Intrinsic::x86_sse2_psra_w; 7133 break; 7134 case Intrinsic::x86_sse2_psrai_d: 7135 NewIntNo = Intrinsic::x86_sse2_psra_d; 7136 break; 7137 default: { 7138 ShAmtVT = MVT::v2i32; 7139 switch (IntNo) { 7140 case Intrinsic::x86_mmx_pslli_w: 7141 NewIntNo = Intrinsic::x86_mmx_psll_w; 7142 break; 7143 case Intrinsic::x86_mmx_pslli_d: 7144 NewIntNo = Intrinsic::x86_mmx_psll_d; 7145 break; 7146 case Intrinsic::x86_mmx_pslli_q: 7147 NewIntNo = Intrinsic::x86_mmx_psll_q; 7148 break; 7149 case Intrinsic::x86_mmx_psrli_w: 7150 NewIntNo = Intrinsic::x86_mmx_psrl_w; 7151 break; 7152 case Intrinsic::x86_mmx_psrli_d: 7153 NewIntNo = Intrinsic::x86_mmx_psrl_d; 7154 break; 7155 case Intrinsic::x86_mmx_psrli_q: 7156 NewIntNo = Intrinsic::x86_mmx_psrl_q; 7157 break; 7158 case Intrinsic::x86_mmx_psrai_w: 7159 NewIntNo = Intrinsic::x86_mmx_psra_w; 7160 break; 7161 case Intrinsic::x86_mmx_psrai_d: 7162 NewIntNo = Intrinsic::x86_mmx_psra_d; 7163 break; 7164 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 7165 } 7166 break; 7167 } 7168 } 7169 7170 // The vector shift intrinsics with scalars uses 32b shift amounts but 7171 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 7172 // to be zero. 
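    // Concretely (illustrative), the vector built below is
    // { amt, 0, undef, undef } for the v4i32 SSE2 case and { amt, 0 } for
    // the v2i32 MMX case, so the 64 bits the instruction reads are the
    // zero-extended 32-bit shift amount.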
7173 SDValue ShOps[4]; 7174 ShOps[0] = ShAmt; 7175 ShOps[1] = DAG.getConstant(0, MVT::i32); 7176 if (ShAmtVT == MVT::v4i32) { 7177 ShOps[2] = DAG.getUNDEF(MVT::i32); 7178 ShOps[3] = DAG.getUNDEF(MVT::i32); 7179 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7180 } else { 7181 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7182 } 7183 7184 EVT VT = Op.getValueType(); 7185 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 7186 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7187 DAG.getConstant(NewIntNo, MVT::i32), 7188 Op.getOperand(1), ShAmt); 7189 } 7190 } 7191} 7192 7193SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 7194 SelectionDAG &DAG) const { 7195 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7196 MFI->setReturnAddressIsTaken(true); 7197 7198 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7199 DebugLoc dl = Op.getDebugLoc(); 7200 7201 if (Depth > 0) { 7202 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7203 SDValue Offset = 7204 DAG.getConstant(TD->getPointerSize(), 7205 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7206 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7207 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7208 FrameAddr, Offset), 7209 NULL, 0, false, false, 0); 7210 } 7211 7212 // Just load the return address. 7213 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7214 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7215 RetAddrFI, NULL, 0, false, false, 0); 7216} 7217 7218SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 7219 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7220 MFI->setFrameAddressIsTaken(true); 7221 7222 EVT VT = Op.getValueType(); 7223 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 7224 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7225 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 7226 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 7227 while (Depth--) 7228 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, 7229 false, false, 0); 7230 return FrameAddr; 7231} 7232 7233SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 7234 SelectionDAG &DAG) const { 7235 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 7236} 7237 7238SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 7239 MachineFunction &MF = DAG.getMachineFunction(); 7240 SDValue Chain = Op.getOperand(0); 7241 SDValue Offset = Op.getOperand(1); 7242 SDValue Handler = Op.getOperand(2); 7243 DebugLoc dl = Op.getDebugLoc(); 7244 7245 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 7246 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 7247 getPointerTy()); 7248 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 7249 7250 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 7251 DAG.getIntPtrConstant(TD->getPointerSize())); 7252 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7253 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); 7254 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7255 MF.getRegInfo().addLiveOut(StoreAddrReg); 7256 7257 return DAG.getNode(X86ISD::EH_RETURN, dl, 7258 MVT::Other, 7259 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7260} 7261 7262SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7263 SelectionDAG &DAG) const { 7264 SDValue Root = Op.getOperand(0); 7265 SDValue Trmp = Op.getOperand(1); // trampoline 7266 SDValue FPtr = Op.getOperand(2); // nested function 7267 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7268 DebugLoc dl = Op.getDebugLoc(); 7269 7270 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7271 7272 if (Subtarget->is64Bit()) { 7273 SDValue OutChains[6]; 7274 7275 // Large code-model. 7276 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7277 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7278 7279 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7280 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7281 7282 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7283 7284 // Load the pointer to the nested function into R11. 7285 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 7286 SDValue Addr = Trmp; 7287 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7288 Addr, TrmpAddr, 0, false, false, 0); 7289 7290 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7291 DAG.getConstant(2, MVT::i64)); 7292 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, 7293 false, false, 2); 7294 7295 // Load the 'nest' parameter value into R10. 7296 // R10 is specified in X86CallingConv.td 7297 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 7298 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7299 DAG.getConstant(10, MVT::i64)); 7300 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7301 Addr, TrmpAddr, 10, false, false, 0); 7302 7303 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7304 DAG.getConstant(12, MVT::i64)); 7305 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, 7306 false, false, 2); 7307 7308 // Jump to the nested function. 7309 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
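    // The six OutChains stores assemble this trampoline (illustrative byte
    // layout, offsets in decimal):
    //    0: 49 BB <imm64>   movabsq $fptr, %r11
    //   10: 49 BA <imm64>   movabsq $nest, %r10
    //   20: 49 FF E3        jmpq   *%r11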
7310 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7311 DAG.getConstant(20, MVT::i64)); 7312 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7313 Addr, TrmpAddr, 20, false, false, 0); 7314 7315 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 7316 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7317 DAG.getConstant(22, MVT::i64)); 7318 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 7319 TrmpAddr, 22, false, false, 0); 7320 7321 SDValue Ops[] = 7322 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 7323 return DAG.getMergeValues(Ops, 2, dl); 7324 } else { 7325 const Function *Func = 7326 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 7327 CallingConv::ID CC = Func->getCallingConv(); 7328 unsigned NestReg; 7329 7330 switch (CC) { 7331 default: 7332 llvm_unreachable("Unsupported calling convention"); 7333 case CallingConv::C: 7334 case CallingConv::X86_StdCall: { 7335 // Pass 'nest' parameter in ECX. 7336 // Must be kept in sync with X86CallingConv.td 7337 NestReg = X86::ECX; 7338 7339 // Check that ECX wasn't needed by an 'inreg' parameter. 7340 const FunctionType *FTy = Func->getFunctionType(); 7341 const AttrListPtr &Attrs = Func->getAttributes(); 7342 7343 if (!Attrs.isEmpty() && !Func->isVarArg()) { 7344 unsigned InRegCount = 0; 7345 unsigned Idx = 1; 7346 7347 for (FunctionType::param_iterator I = FTy->param_begin(), 7348 E = FTy->param_end(); I != E; ++I, ++Idx) 7349 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 7350 // FIXME: should only count parameters that are lowered to integers. 7351 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 7352 7353 if (InRegCount > 2) { 7354 report_fatal_error("Nest register in use - reduce number of inreg" 7355 " parameters!"); 7356 } 7357 } 7358 break; 7359 } 7360 case CallingConv::X86_FastCall: 7361 case CallingConv::X86_ThisCall: 7362 case CallingConv::Fast: 7363 // Pass 'nest' parameter in EAX. 7364 // Must be kept in sync with X86CallingConv.td 7365 NestReg = X86::EAX; 7366 break; 7367 } 7368 7369 SDValue OutChains[4]; 7370 SDValue Addr, Disp; 7371 7372 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7373 DAG.getConstant(10, MVT::i32)); 7374 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 7375 7376 // This is storing the opcode for MOV32ri. 7377 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 7378 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 7379 OutChains[0] = DAG.getStore(Root, dl, 7380 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 7381 Trmp, TrmpAddr, 0, false, false, 0); 7382 7383 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7384 DAG.getConstant(1, MVT::i32)); 7385 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, 7386 false, false, 1); 7387 7388 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
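    // The four OutChains stores assemble a 10-byte trampoline (illustrative
    // byte layout):
    //   0: B8+reg <imm32>   movl $nest, %ecx or %eax
    //   5: E9     <rel32>   jmp  fptr     ; rel32 = fptr - (tramp + 10)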
7389 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7390 DAG.getConstant(5, MVT::i32)); 7391 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 7392 TrmpAddr, 5, false, false, 1); 7393 7394 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7395 DAG.getConstant(6, MVT::i32)); 7396 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, 7397 false, false, 1); 7398 7399 SDValue Ops[] = 7400 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 7401 return DAG.getMergeValues(Ops, 2, dl); 7402 } 7403} 7404 7405SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 7406 SelectionDAG &DAG) const { 7407 /* 7408 The rounding mode is in bits 11:10 of FPSR, and has the following 7409 settings: 7410 00 Round to nearest 7411 01 Round to -inf 7412 10 Round to +inf 7413 11 Round to 0 7414 7415 FLT_ROUNDS, on the other hand, expects the following: 7416 -1 Undefined 7417 0 Round to 0 7418 1 Round to nearest 7419 2 Round to +inf 7420 3 Round to -inf 7421 7422 To perform the conversion, we do: 7423 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 7424 */ 7425 7426 MachineFunction &MF = DAG.getMachineFunction(); 7427 const TargetMachine &TM = MF.getTarget(); 7428 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 7429 unsigned StackAlignment = TFI.getStackAlignment(); 7430 EVT VT = Op.getValueType(); 7431 DebugLoc dl = Op.getDebugLoc(); 7432 7433 // Save FP Control Word to stack slot 7434 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 7435 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7436 7437 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 7438 DAG.getEntryNode(), StackSlot); 7439 7440 // Load FP Control Word from stack slot 7441 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0, 7442 false, false, 0); 7443 7444 // Transform as necessary 7445 SDValue CWD1 = 7446 DAG.getNode(ISD::SRL, dl, MVT::i16, 7447 DAG.getNode(ISD::AND, dl, MVT::i16, 7448 CWD, DAG.getConstant(0x800, MVT::i16)), 7449 DAG.getConstant(11, MVT::i8)); 7450 SDValue CWD2 = 7451 DAG.getNode(ISD::SRL, dl, MVT::i16, 7452 DAG.getNode(ISD::AND, dl, MVT::i16, 7453 CWD, DAG.getConstant(0x400, MVT::i16)), 7454 DAG.getConstant(9, MVT::i8)); 7455 7456 SDValue RetVal = 7457 DAG.getNode(ISD::AND, dl, MVT::i16, 7458 DAG.getNode(ISD::ADD, dl, MVT::i16, 7459 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 7460 DAG.getConstant(1, MVT::i16)), 7461 DAG.getConstant(3, MVT::i16)); 7462 7463 7464 return DAG.getNode((VT.getSizeInBits() < 16 ? 7465 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7466} 7467 7468SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 7469 EVT VT = Op.getValueType(); 7470 EVT OpVT = VT; 7471 unsigned NumBits = VT.getSizeInBits(); 7472 DebugLoc dl = Op.getDebugLoc(); 7473 7474 Op = Op.getOperand(0); 7475 if (VT == MVT::i8) { 7476 // Zero extend to i32 since there is not an i8 bsr. 7477 OpVT = MVT::i32; 7478 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7479 } 7480 7481 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 7482 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7483 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 7484 7485 // If src is zero (i.e. bsr sets ZF), returns NumBits. 7486 SDValue Ops[] = { 7487 Op, 7488 DAG.getConstant(NumBits+NumBits-1, OpVT), 7489 DAG.getConstant(X86::COND_E, MVT::i8), 7490 Op.getValue(1) 7491 }; 7492 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7493 7494 // Finally xor with NumBits-1. 
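  // Worked example (i32, illustrative): for an input of 1, BSR reports bit
  // index 0 and 0 ^ 31 = 31 leading zeros; for an input of 0 the CMOV above
  // substitutes 63, and 63 ^ 31 = 32, the value this lowering defines for
  // ctlz(0).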
7495 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 7496 7497 if (VT == MVT::i8) 7498 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7499 return Op; 7500} 7501 7502SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 7503 EVT VT = Op.getValueType(); 7504 EVT OpVT = VT; 7505 unsigned NumBits = VT.getSizeInBits(); 7506 DebugLoc dl = Op.getDebugLoc(); 7507 7508 Op = Op.getOperand(0); 7509 if (VT == MVT::i8) { 7510 OpVT = MVT::i32; 7511 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7512 } 7513 7514 // Issue a bsf (scan bits forward) which also sets EFLAGS. 7515 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7516 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 7517 7518 // If src is zero (i.e. bsf sets ZF), returns NumBits. 7519 SDValue Ops[] = { 7520 Op, 7521 DAG.getConstant(NumBits, OpVT), 7522 DAG.getConstant(X86::COND_E, MVT::i8), 7523 Op.getValue(1) 7524 }; 7525 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7526 7527 if (VT == MVT::i8) 7528 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7529 return Op; 7530} 7531 7532SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 7533 EVT VT = Op.getValueType(); 7534 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 7535 DebugLoc dl = Op.getDebugLoc(); 7536 7537 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 7538 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 7539 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 7540 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 7541 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 7542 // 7543 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 7544 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 7545 // return AloBlo + AloBhi + AhiBlo; 7546 7547 SDValue A = Op.getOperand(0); 7548 SDValue B = Op.getOperand(1); 7549 7550 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7551 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7552 A, DAG.getConstant(32, MVT::i32)); 7553 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7554 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7555 B, DAG.getConstant(32, MVT::i32)); 7556 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7557 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7558 A, B); 7559 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7560 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7561 A, Bhi); 7562 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7563 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7564 Ahi, B); 7565 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7566 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7567 AloBhi, DAG.getConstant(32, MVT::i32)); 7568 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7569 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7570 AhiBlo, DAG.getConstant(32, MVT::i32)); 7571 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 7572 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 7573 return Res; 7574} 7575 7576SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { 7577 EVT VT = Op.getValueType(); 7578 DebugLoc dl = Op.getDebugLoc(); 7579 SDValue R = Op.getOperand(0); 7580 7581 LLVMContext *Context = DAG.getContext(); 7582 7583 assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); 7584 7585 if (VT == MVT::v4i32) { 7586 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7587 
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 7588 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 7589 7590 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 7591 7592 std::vector<Constant*> CV(4, CI); 7593 Constant *C = ConstantVector::get(CV); 7594 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7595 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7596 PseudoSourceValue::getConstantPool(), 0, 7597 false, false, 16); 7598 7599 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 7600 Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); 7601 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 7602 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 7603 } 7604 if (VT == MVT::v16i8) { 7605 // a = a << 5; 7606 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7607 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 7608 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 7609 7610 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 7611 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 7612 7613 std::vector<Constant*> CVM1(16, CM1); 7614 std::vector<Constant*> CVM2(16, CM2); 7615 Constant *C = ConstantVector::get(CVM1); 7616 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7617 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7618 PseudoSourceValue::getConstantPool(), 0, 7619 false, false, 16); 7620 7621 // r = pblendv(r, psllw(r & (char16)15, 4), a); 7622 M = DAG.getNode(ISD::AND, dl, VT, R, M); 7623 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7624 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 7625 DAG.getConstant(4, MVT::i32)); 7626 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7627 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7628 R, M, Op); 7629 // a += a 7630 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 7631 7632 C = ConstantVector::get(CVM2); 7633 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7634 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7635 PseudoSourceValue::getConstantPool(), 0, false, false, 16); 7636 7637 // r = pblendv(r, psllw(r & (char16)63, 2), a); 7638 M = DAG.getNode(ISD::AND, dl, VT, R, M); 7639 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7640 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 7641 DAG.getConstant(2, MVT::i32)); 7642 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7643 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7644 R, M, Op); 7645 // a += a 7646 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 7647 7648 // return pblendv(r, r+r, a); 7649 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7650 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7651 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 7652 return R; 7653 } 7654 return SDValue(); 7655} 7656 7657SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 7658 // Lower the "add/sub/mul with overflow" instruction into a regular instruction 7659 // plus a "setcc" instruction that checks the overflow flag. The "brcond" lowering 7660 // looks for this combo and may remove the "setcc" instruction if the "setcc" 7661 // has only one use. 7662 SDNode *N = Op.getNode(); 7663 SDValue LHS = N->getOperand(0); 7664 SDValue RHS = N->getOperand(1); 7665 unsigned BaseOp = 0; 7666 unsigned Cond = 0; 7667 DebugLoc dl = Op.getDebugLoc(); 7668 7669 switch (Op.getOpcode()) { 7670 default: llvm_unreachable("Unknown ovf instruction!"); 7671 case ISD::SADDO: 7672 // An add of one will be selected as an INC.
Note that INC doesn't 7673 // set CF, so we can't do this for UADDO. 7674 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7675 if (C->getAPIntValue() == 1) { 7676 BaseOp = X86ISD::INC; 7677 Cond = X86::COND_O; 7678 break; 7679 } 7680 BaseOp = X86ISD::ADD; 7681 Cond = X86::COND_O; 7682 break; 7683 case ISD::UADDO: 7684 BaseOp = X86ISD::ADD; 7685 Cond = X86::COND_B; 7686 break; 7687 case ISD::SSUBO: 7688 // A subtract of one will be selected as a DEC. Note that DEC doesn't 7689 // set CF, so we can't do this for USUBO. 7690 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7691 if (C->getAPIntValue() == 1) { 7692 BaseOp = X86ISD::DEC; 7693 Cond = X86::COND_O; 7694 break; 7695 } 7696 BaseOp = X86ISD::SUB; 7697 Cond = X86::COND_O; 7698 break; 7699 case ISD::USUBO: 7700 BaseOp = X86ISD::SUB; 7701 Cond = X86::COND_B; 7702 break; 7703 case ISD::SMULO: 7704 BaseOp = X86ISD::SMUL; 7705 Cond = X86::COND_O; 7706 break; 7707 case ISD::UMULO: 7708 BaseOp = X86ISD::UMUL; 7709 Cond = X86::COND_B; 7710 break; 7711 } 7712 7713 // Also sets EFLAGS. 7714 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 7715 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 7716 7717 SDValue SetCC = 7718 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 7719 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 7720 7721 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 7722 return Sum; 7723} 7724 7725SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 7726 DebugLoc dl = Op.getDebugLoc(); 7727 7728 if (!Subtarget->hasSSE2()) { 7729 SDValue Chain = Op.getOperand(0); 7730 SDValue Zero = DAG.getConstant(0, 7731 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7732 SDValue Ops[] = { 7733 DAG.getRegister(X86::ESP, MVT::i32), // Base 7734 DAG.getTargetConstant(1, MVT::i8), // Scale 7735 DAG.getRegister(0, MVT::i32), // Index 7736 DAG.getTargetConstant(0, MVT::i32), // Disp 7737 DAG.getRegister(0, MVT::i32), // Segment. 
7738 Zero, 7739 Chain 7740 }; 7741 SDNode *Res = 7742 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 7743 array_lengthof(Ops)); 7744 return SDValue(Res, 0); 7745 } 7746 7747 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 7748 if (!isDev) 7749 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 7750 7751 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7752 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 7753 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 7754 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 7755 7756 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 7757 if (!Op1 && !Op2 && !Op3 && Op4) 7758 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 7759 7760 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 7761 if (Op1 && !Op2 && !Op3 && !Op4) 7762 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 7763 7764 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 7765 // (MFENCE)>; 7766 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 7767} 7768 7769SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 7770 EVT T = Op.getValueType(); 7771 DebugLoc dl = Op.getDebugLoc(); 7772 unsigned Reg = 0; 7773 unsigned size = 0; 7774 switch(T.getSimpleVT().SimpleTy) { 7775 default: 7776 assert(false && "Invalid value type!"); 7777 case MVT::i8: Reg = X86::AL; size = 1; break; 7778 case MVT::i16: Reg = X86::AX; size = 2; break; 7779 case MVT::i32: Reg = X86::EAX; size = 4; break; 7780 case MVT::i64: 7781 assert(Subtarget->is64Bit() && "Node not type legal!"); 7782 Reg = X86::RAX; size = 8; 7783 break; 7784 } 7785 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 7786 Op.getOperand(2), SDValue()); 7787 SDValue Ops[] = { cpIn.getValue(0), 7788 Op.getOperand(1), 7789 Op.getOperand(3), 7790 DAG.getTargetConstant(size, MVT::i8), 7791 cpIn.getValue(1) }; 7792 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7793 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 7794 SDValue cpOut = 7795 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 7796 return cpOut; 7797} 7798 7799SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 7800 SelectionDAG &DAG) const { 7801 assert(Subtarget->is64Bit() && "Result not type legalized?"); 7802 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7803 SDValue TheChain = Op.getOperand(0); 7804 DebugLoc dl = Op.getDebugLoc(); 7805 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7806 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 7807 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 7808 rax.getValue(2)); 7809 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 7810 DAG.getConstant(32, MVT::i8)); 7811 SDValue Ops[] = { 7812 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7813 rdx.getValue(1) 7814 }; 7815 return DAG.getMergeValues(Ops, 2, dl); 7816} 7817 7818SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, 7819 SelectionDAG &DAG) const { 7820 EVT SrcVT = Op.getOperand(0).getValueType(); 7821 EVT DstVT = Op.getValueType(); 7822 assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && 7823 Subtarget->hasMMX() && !DisableMMX) && 7824 "Unexpected custom BIT_CONVERT"); 7825 assert((DstVT == MVT::i64 || 7826 (DstVT.isVector() && 
DstVT.getSizeInBits()==64)) && 7827 "Unexpected custom BIT_CONVERT"); 7828 // i64 <=> MMX conversions are Legal. 7829 if (SrcVT==MVT::i64 && DstVT.isVector()) 7830 return Op; 7831 if (DstVT==MVT::i64 && SrcVT.isVector()) 7832 return Op; 7833 // MMX <=> MMX conversions are Legal. 7834 if (SrcVT.isVector() && DstVT.isVector()) 7835 return Op; 7836 // All other conversions need to be expanded. 7837 return SDValue(); 7838} 7839SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 7840 SDNode *Node = Op.getNode(); 7841 DebugLoc dl = Node->getDebugLoc(); 7842 EVT T = Node->getValueType(0); 7843 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7844 DAG.getConstant(0, T), Node->getOperand(2)); 7845 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7846 cast<AtomicSDNode>(Node)->getMemoryVT(), 7847 Node->getOperand(0), 7848 Node->getOperand(1), negOp, 7849 cast<AtomicSDNode>(Node)->getSrcValue(), 7850 cast<AtomicSDNode>(Node)->getAlignment()); 7851} 7852 7853/// LowerOperation - Provide custom lowering hooks for some operations. 7854/// 7855SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 7856 switch (Op.getOpcode()) { 7857 default: llvm_unreachable("Should not custom lower this!"); 7858 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 7859 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7860 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7861 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7862 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7863 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7864 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7865 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7866 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7867 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7868 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7869 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7870 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7871 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7872 case ISD::SHL_PARTS: 7873 case ISD::SRA_PARTS: 7874 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7875 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7876 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7877 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7878 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7879 case ISD::FABS: return LowerFABS(Op, DAG); 7880 case ISD::FNEG: return LowerFNEG(Op, DAG); 7881 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7882 case ISD::SETCC: return LowerSETCC(Op, DAG); 7883 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7884 case ISD::SELECT: return LowerSELECT(Op, DAG); 7885 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7886 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7887 case ISD::VASTART: return LowerVASTART(Op, DAG); 7888 case ISD::VAARG: return LowerVAARG(Op, DAG); 7889 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7890 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7891 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7892 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7893 case ISD::FRAME_TO_ARGS_OFFSET: 7894 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7895 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7896 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7897 case ISD::TRAMPOLINE: return 
LowerTRAMPOLINE(Op, DAG); 7898 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7899 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7900 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7901 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7902 case ISD::SHL: return LowerSHL(Op, DAG); 7903 case ISD::SADDO: 7904 case ISD::UADDO: 7905 case ISD::SSUBO: 7906 case ISD::USUBO: 7907 case ISD::SMULO: 7908 case ISD::UMULO: return LowerXALUO(Op, DAG); 7909 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7910 case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG); 7911 } 7912} 7913 7914void X86TargetLowering:: 7915ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7916 SelectionDAG &DAG, unsigned NewOp) const { 7917 EVT T = Node->getValueType(0); 7918 DebugLoc dl = Node->getDebugLoc(); 7919 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7920 7921 SDValue Chain = Node->getOperand(0); 7922 SDValue In1 = Node->getOperand(1); 7923 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7924 Node->getOperand(2), DAG.getIntPtrConstant(0)); 7925 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7926 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7927 SDValue Ops[] = { Chain, In1, In2L, In2H }; 7928 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7929 SDValue Result = 7930 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 7931 cast<MemSDNode>(Node)->getMemOperand()); 7932 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7933 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7934 Results.push_back(Result.getValue(2)); 7935} 7936 7937/// ReplaceNodeResults - Replace a node with an illegal result type 7938/// with a new node built out of custom code. 7939void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7940 SmallVectorImpl<SDValue>&Results, 7941 SelectionDAG &DAG) const { 7942 DebugLoc dl = N->getDebugLoc(); 7943 switch (N->getOpcode()) { 7944 default: 7945 assert(false && "Do not know how to custom type legalize this operation!"); 7946 return; 7947 case ISD::FP_TO_SINT: { 7948 std::pair<SDValue,SDValue> Vals = 7949 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 7950 SDValue FIST = Vals.first, StackSlot = Vals.second; 7951 if (FIST.getNode() != 0) { 7952 EVT VT = N->getValueType(0); 7953 // Return a load from the stack slot. 7954 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0, 7955 false, false, 0)); 7956 } 7957 return; 7958 } 7959 case ISD::READCYCLECOUNTER: { 7960 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7961 SDValue TheChain = N->getOperand(0); 7962 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7963 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 7964 rd.getValue(1)); 7965 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 7966 eax.getValue(2)); 7967 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
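// RDTSC returns the 64-bit timestamp counter in EDX:EAX.  Because i64 is not
// a legal type on this path, the two halves are recombined with a BUILD_PAIR
// instead of an explicit shift+or; the resulting value is the same as in this
// scalar sketch:
//
//   uint64_t TSC = ((uint64_t)EDX << 32) | EAX;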
7968 SDValue Ops[] = { eax, edx }; 7969 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 7970 Results.push_back(edx.getValue(1)); 7971 return; 7972 } 7973 case ISD::ATOMIC_CMP_SWAP: { 7974 EVT T = N->getValueType(0); 7975 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 7976 SDValue cpInL, cpInH; 7977 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7978 DAG.getConstant(0, MVT::i32)); 7979 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7980 DAG.getConstant(1, MVT::i32)); 7981 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 7982 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 7983 cpInL.getValue(1)); 7984 SDValue swapInL, swapInH; 7985 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7986 DAG.getConstant(0, MVT::i32)); 7987 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7988 DAG.getConstant(1, MVT::i32)); 7989 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 7990 cpInH.getValue(1)); 7991 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 7992 swapInL.getValue(1)); 7993 SDValue Ops[] = { swapInH.getValue(0), 7994 N->getOperand(1), 7995 swapInH.getValue(1) }; 7996 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7997 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 7998 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 7999 MVT::i32, Result.getValue(1)); 8000 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 8001 MVT::i32, cpOutL.getValue(2)); 8002 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 8003 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8004 Results.push_back(cpOutH.getValue(1)); 8005 return; 8006 } 8007 case ISD::ATOMIC_LOAD_ADD: 8008 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 8009 return; 8010 case ISD::ATOMIC_LOAD_AND: 8011 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 8012 return; 8013 case ISD::ATOMIC_LOAD_NAND: 8014 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 8015 return; 8016 case ISD::ATOMIC_LOAD_OR: 8017 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 8018 return; 8019 case ISD::ATOMIC_LOAD_SUB: 8020 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 8021 return; 8022 case ISD::ATOMIC_LOAD_XOR: 8023 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 8024 return; 8025 case ISD::ATOMIC_SWAP: 8026 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 8027 return; 8028 } 8029} 8030 8031const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 8032 switch (Opcode) { 8033 default: return NULL; 8034 case X86ISD::BSF: return "X86ISD::BSF"; 8035 case X86ISD::BSR: return "X86ISD::BSR"; 8036 case X86ISD::SHLD: return "X86ISD::SHLD"; 8037 case X86ISD::SHRD: return "X86ISD::SHRD"; 8038 case X86ISD::FAND: return "X86ISD::FAND"; 8039 case X86ISD::FOR: return "X86ISD::FOR"; 8040 case X86ISD::FXOR: return "X86ISD::FXOR"; 8041 case X86ISD::FSRL: return "X86ISD::FSRL"; 8042 case X86ISD::FILD: return "X86ISD::FILD"; 8043 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 8044 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 8045 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 8046 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 8047 case X86ISD::FLD: return 
"X86ISD::FLD"; 8048 case X86ISD::FST: return "X86ISD::FST"; 8049 case X86ISD::CALL: return "X86ISD::CALL"; 8050 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 8051 case X86ISD::BT: return "X86ISD::BT"; 8052 case X86ISD::CMP: return "X86ISD::CMP"; 8053 case X86ISD::COMI: return "X86ISD::COMI"; 8054 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 8055 case X86ISD::SETCC: return "X86ISD::SETCC"; 8056 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 8057 case X86ISD::CMOV: return "X86ISD::CMOV"; 8058 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 8059 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 8060 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 8061 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 8062 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 8063 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 8064 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 8065 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 8066 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 8067 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 8068 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 8069 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 8070 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; 8071 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 8072 case X86ISD::FMAX: return "X86ISD::FMAX"; 8073 case X86ISD::FMIN: return "X86ISD::FMIN"; 8074 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 8075 case X86ISD::FRCP: return "X86ISD::FRCP"; 8076 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 8077 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 8078 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 8079 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 8080 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 8081 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 8082 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 8083 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 8084 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 8085 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 8086 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 8087 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 8088 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 8089 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 8090 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 8091 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 8092 case X86ISD::VSHL: return "X86ISD::VSHL"; 8093 case X86ISD::VSRL: return "X86ISD::VSRL"; 8094 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 8095 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 8096 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 8097 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 8098 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 8099 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 8100 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 8101 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 8102 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 8103 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 8104 case X86ISD::ADD: return "X86ISD::ADD"; 8105 case X86ISD::SUB: return "X86ISD::SUB"; 8106 case X86ISD::SMUL: return "X86ISD::SMUL"; 8107 case X86ISD::UMUL: return "X86ISD::UMUL"; 8108 case X86ISD::INC: return "X86ISD::INC"; 8109 case X86ISD::DEC: return "X86ISD::DEC"; 8110 case X86ISD::OR: return "X86ISD::OR"; 8111 case X86ISD::XOR: return "X86ISD::XOR"; 8112 case X86ISD::AND: return "X86ISD::AND"; 8113 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 8114 case X86ISD::PTEST: return 
"X86ISD::PTEST"; 8115 case X86ISD::TESTP: return "X86ISD::TESTP"; 8116 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 8117 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 8118 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 8119 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 8120 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 8121 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 8122 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 8123 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 8124 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 8125 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 8126 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 8127 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 8128 case X86ISD::MOVHPS: return "X86ISD::MOVHPS"; 8129 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 8130 case X86ISD::MOVHPD: return "X86ISD::MOVHPD"; 8131 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 8132 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 8133 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 8134 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 8135 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 8136 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 8137 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 8138 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 8139 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 8140 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 8141 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 8142 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 8143 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 8144 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 8145 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 8146 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 8147 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 8148 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 8149 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 8150 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 8151 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 8152 case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; 8153 } 8154} 8155 8156// isLegalAddressingMode - Return true if the addressing mode represented 8157// by AM is legal for this target, for a load/store of the specified type. 8158bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 8159 const Type *Ty) const { 8160 // X86 supports extremely general addressing modes. 8161 CodeModel::Model M = getTargetMachine().getCodeModel(); 8162 8163 // X86 allows a sign-extended 32-bit immediate field as a displacement. 8164 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 8165 return false; 8166 8167 if (AM.BaseGV) { 8168 unsigned GVFlags = 8169 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 8170 8171 // If a reference to this global requires an extra load, we can't fold it. 8172 if (isGlobalStubReference(GVFlags)) 8173 return false; 8174 8175 // If BaseGV requires a register for the PIC base, we cannot also have a 8176 // BaseReg specified. 8177 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 8178 return false; 8179 8180 // If lower 4G is not available, then we must use rip-relative addressing. 8181 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 8182 return false; 8183 } 8184 8185 switch (AM.Scale) { 8186 case 0: 8187 case 1: 8188 case 2: 8189 case 4: 8190 case 8: 8191 // These scales always work. 
8192 break; 8193 case 3: 8194 case 5: 8195 case 9: 8196 // These scales are formed with basereg+scalereg. Only accept if there is 8197 // no basereg yet. 8198 if (AM.HasBaseReg) 8199 return false; 8200 break; 8201 default: // Other stuff never works. 8202 return false; 8203 } 8204 8205 return true; 8206} 8207 8208 8209bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 8210 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 8211 return false; 8212 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 8213 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 8214 if (NumBits1 <= NumBits2) 8215 return false; 8216 return true; 8217} 8218 8219bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 8220 if (!VT1.isInteger() || !VT2.isInteger()) 8221 return false; 8222 unsigned NumBits1 = VT1.getSizeInBits(); 8223 unsigned NumBits2 = VT2.getSizeInBits(); 8224 if (NumBits1 <= NumBits2) 8225 return false; 8226 return true; 8227} 8228 8229bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 8230 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8231 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 8232} 8233 8234bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 8235 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8236 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 8237} 8238 8239bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 8240 // i16 instructions are longer (0x66 prefix) and potentially slower. 8241 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 8242} 8243 8244/// isShuffleMaskLegal - Targets can use this to indicate that they only 8245/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 8246/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 8247/// are assumed to be legal. 8248bool 8249X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 8250 EVT VT) const { 8251 // Very little shuffling can be done for 64-bit vectors right now. 8252 if (VT.getSizeInBits() == 64) 8253 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 8254 8255 // FIXME: pshufb, blends, shifts. 8256 return (VT.getVectorNumElements() == 2 || 8257 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 8258 isMOVLMask(M, VT) || 8259 isSHUFPMask(M, VT) || 8260 isPSHUFDMask(M, VT) || 8261 isPSHUFHWMask(M, VT) || 8262 isPSHUFLWMask(M, VT) || 8263 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 8264 isUNPCKLMask(M, VT) || 8265 isUNPCKHMask(M, VT) || 8266 isUNPCKL_v_undef_Mask(M, VT) || 8267 isUNPCKH_v_undef_Mask(M, VT)); 8268} 8269 8270bool 8271X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 8272 EVT VT) const { 8273 unsigned NumElts = VT.getVectorNumElements(); 8274 // FIXME: This collection of masks seems suspect. 
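// (A "clear mask" is a shuffle that mixes a value with zero in order to zero
//  out individual lanes; the DAG combiner asks this question when it wants to
//  turn an AND with a 0/-1 constant vector into such a shuffle.  Roughly, in
//  IR terms:
//
//    %r = and <4 x i32> %x, <i32 -1, i32 0, i32 -1, i32 0>
//    ; becomes
//    %r = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer,
//                       <4 x i32> <i32 0, i32 5, i32 2, i32 7>
//  )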
8275 if (NumElts == 2) 8276 return true; 8277 if (NumElts == 4 && VT.getSizeInBits() == 128) { 8278 return (isMOVLMask(Mask, VT) || 8279 isCommutedMOVLMask(Mask, VT, true) || 8280 isSHUFPMask(Mask, VT) || 8281 isCommutedSHUFPMask(Mask, VT)); 8282 } 8283 return false; 8284} 8285 8286//===----------------------------------------------------------------------===// 8287// X86 Scheduler Hooks 8288//===----------------------------------------------------------------------===// 8289 8290// private utility function 8291MachineBasicBlock * 8292X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 8293 MachineBasicBlock *MBB, 8294 unsigned regOpc, 8295 unsigned immOpc, 8296 unsigned LoadOpc, 8297 unsigned CXchgOpc, 8298 unsigned notOpc, 8299 unsigned EAXreg, 8300 TargetRegisterClass *RC, 8301 bool invSrc) const { 8302 // For the atomic bitwise operator, we generate 8303 // thisMBB: 8304 // newMBB: 8305 // ld t1 = [bitinstr.addr] 8306 // op t2 = t1, [bitinstr.val] 8307 // mov EAX = t1 8308 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8309 // bz newMBB 8310 // fallthrough -->nextMBB 8311 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8312 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8313 MachineFunction::iterator MBBIter = MBB; 8314 ++MBBIter; 8315 8316 /// First build the CFG 8317 MachineFunction *F = MBB->getParent(); 8318 MachineBasicBlock *thisMBB = MBB; 8319 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8320 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8321 F->insert(MBBIter, newMBB); 8322 F->insert(MBBIter, nextMBB); 8323 8324 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8325 nextMBB->splice(nextMBB->begin(), thisMBB, 8326 llvm::next(MachineBasicBlock::iterator(bInstr)), 8327 thisMBB->end()); 8328 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8329 8330 // Update thisMBB to fall through to newMBB 8331 thisMBB->addSuccessor(newMBB); 8332 8333 // newMBB jumps to itself and fall through to nextMBB 8334 newMBB->addSuccessor(nextMBB); 8335 newMBB->addSuccessor(newMBB); 8336 8337 // Insert instructions into newMBB based on incoming instruction 8338 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 8339 "unexpected number of operands"); 8340 DebugLoc dl = bInstr->getDebugLoc(); 8341 MachineOperand& destOper = bInstr->getOperand(0); 8342 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8343 int numArgs = bInstr->getNumOperands() - 1; 8344 for (int i=0; i < numArgs; ++i) 8345 argOpers[i] = &bInstr->getOperand(i+1); 8346 8347 // x86 address has 4 operands: base, index, scale, and displacement 8348 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8349 int valArgIndx = lastAddrIndx + 1; 8350 8351 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8352 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 8353 for (int i=0; i <= lastAddrIndx; ++i) 8354 (*MIB).addOperand(*argOpers[i]); 8355 8356 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 8357 if (invSrc) { 8358 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 8359 } 8360 else 8361 tt = t1; 8362 8363 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8364 assert((argOpers[valArgIndx]->isReg() || 8365 argOpers[valArgIndx]->isImm()) && 8366 "invalid operand"); 8367 if (argOpers[valArgIndx]->isReg()) 8368 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 8369 else 8370 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 8371 MIB.addReg(tt); 8372 
(*MIB).addOperand(*argOpers[valArgIndx]); 8373 8374 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 8375 MIB.addReg(t1); 8376 8377 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 8378 for (int i=0; i <= lastAddrIndx; ++i) 8379 (*MIB).addOperand(*argOpers[i]); 8380 MIB.addReg(t2); 8381 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8382 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8383 bInstr->memoperands_end()); 8384 8385 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 8386 MIB.addReg(EAXreg); 8387 8388 // insert branch 8389 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8390 8391 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 8392 return nextMBB; 8393} 8394 8395// private utility function: 64 bit atomics on 32 bit host. 8396MachineBasicBlock * 8397X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 8398 MachineBasicBlock *MBB, 8399 unsigned regOpcL, 8400 unsigned regOpcH, 8401 unsigned immOpcL, 8402 unsigned immOpcH, 8403 bool invSrc) const { 8404 // For the atomic bitwise operator, we generate 8405 // thisMBB (instructions are in pairs, except cmpxchg8b) 8406 // ld t1,t2 = [bitinstr.addr] 8407 // newMBB: 8408 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 8409 // op t5, t6 <- out1, out2, [bitinstr.val] 8410 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 8411 // mov ECX, EBX <- t5, t6 8412 // mov EAX, EDX <- t1, t2 8413 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 8414 // mov t3, t4 <- EAX, EDX 8415 // bz newMBB 8416 // result in out1, out2 8417 // fallthrough -->nextMBB 8418 8419 const TargetRegisterClass *RC = X86::GR32RegisterClass; 8420 const unsigned LoadOpc = X86::MOV32rm; 8421 const unsigned NotOpc = X86::NOT32r; 8422 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8423 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8424 MachineFunction::iterator MBBIter = MBB; 8425 ++MBBIter; 8426 8427 /// First build the CFG 8428 MachineFunction *F = MBB->getParent(); 8429 MachineBasicBlock *thisMBB = MBB; 8430 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8431 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8432 F->insert(MBBIter, newMBB); 8433 F->insert(MBBIter, nextMBB); 8434 8435 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8436 nextMBB->splice(nextMBB->begin(), thisMBB, 8437 llvm::next(MachineBasicBlock::iterator(bInstr)), 8438 thisMBB->end()); 8439 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8440 8441 // Update thisMBB to fall through to newMBB 8442 thisMBB->addSuccessor(newMBB); 8443 8444 // newMBB jumps to itself and fall through to nextMBB 8445 newMBB->addSuccessor(nextMBB); 8446 newMBB->addSuccessor(newMBB); 8447 8448 DebugLoc dl = bInstr->getDebugLoc(); 8449 // Insert instructions into newMBB based on incoming instruction 8450 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 8451 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 8452 "unexpected number of operands"); 8453 MachineOperand& dest1Oper = bInstr->getOperand(0); 8454 MachineOperand& dest2Oper = bInstr->getOperand(1); 8455 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8456 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 8457 argOpers[i] = &bInstr->getOperand(i+2); 8458 8459 // We use some of the operands multiple times, so conservatively just 8460 // clear any kill flags that might be present. 
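// (A kill flag marks an operand as the last use of its register in the block.
//  Because each address operand is appended to several of the instructions
//  built below, a stale kill flag on an earlier copy would claim the register
//  dies before its later uses, so it is safest to drop the flags here.)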
8461 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 8462 argOpers[i]->setIsKill(false); 8463 } 8464 8465 // x86 address has 5 operands: base, index, scale, displacement, and segment. 8466 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8467 8468 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8469 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 8470 for (int i=0; i <= lastAddrIndx; ++i) 8471 (*MIB).addOperand(*argOpers[i]); 8472 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8473 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 8474 // add 4 to displacement. 8475 for (int i=0; i <= lastAddrIndx-2; ++i) 8476 (*MIB).addOperand(*argOpers[i]); 8477 MachineOperand newOp3 = *(argOpers[3]); 8478 if (newOp3.isImm()) 8479 newOp3.setImm(newOp3.getImm()+4); 8480 else 8481 newOp3.setOffset(newOp3.getOffset()+4); 8482 (*MIB).addOperand(newOp3); 8483 (*MIB).addOperand(*argOpers[lastAddrIndx]); 8484 8485 // t3/4 are defined later, at the bottom of the loop 8486 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 8487 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 8488 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 8489 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 8490 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 8491 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 8492 8493 // The subsequent operations should be using the destination registers of 8494 //the PHI instructions. 8495 if (invSrc) { 8496 t1 = F->getRegInfo().createVirtualRegister(RC); 8497 t2 = F->getRegInfo().createVirtualRegister(RC); 8498 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 8499 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 8500 } else { 8501 t1 = dest1Oper.getReg(); 8502 t2 = dest2Oper.getReg(); 8503 } 8504 8505 int valArgIndx = lastAddrIndx + 1; 8506 assert((argOpers[valArgIndx]->isReg() || 8507 argOpers[valArgIndx]->isImm()) && 8508 "invalid operand"); 8509 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 8510 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 8511 if (argOpers[valArgIndx]->isReg()) 8512 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 8513 else 8514 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 8515 if (regOpcL != X86::MOV32rr) 8516 MIB.addReg(t1); 8517 (*MIB).addOperand(*argOpers[valArgIndx]); 8518 assert(argOpers[valArgIndx + 1]->isReg() == 8519 argOpers[valArgIndx]->isReg()); 8520 assert(argOpers[valArgIndx + 1]->isImm() == 8521 argOpers[valArgIndx]->isImm()); 8522 if (argOpers[valArgIndx + 1]->isReg()) 8523 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 8524 else 8525 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 8526 if (regOpcH != X86::MOV32rr) 8527 MIB.addReg(t2); 8528 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 8529 8530 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 8531 MIB.addReg(t1); 8532 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 8533 MIB.addReg(t2); 8534 8535 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 8536 MIB.addReg(t5); 8537 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 8538 MIB.addReg(t6); 8539 8540 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 8541 for (int i=0; i <= lastAddrIndx; ++i) 8542 (*MIB).addOperand(*argOpers[i]); 8543 8544 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8545 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8546 bInstr->memoperands_end()); 8547 8548 
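// (CMPXCHG8B compares EDX:EAX with the 8-byte memory operand; on failure it
//  reloads the current memory value into EDX:EAX and clears ZF.  Copying
//  EAX/EDX into t3/t4 here feeds that freshly loaded value back into the PHIs
//  at the top of newMBB for the next trip around the retry loop.)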
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 8549 MIB.addReg(X86::EAX); 8550 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 8551 MIB.addReg(X86::EDX); 8552 8553 // insert branch 8554 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8555 8556 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 8557 return nextMBB; 8558} 8559 8560// private utility function 8561MachineBasicBlock * 8562X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 8563 MachineBasicBlock *MBB, 8564 unsigned cmovOpc) const { 8565 // For the atomic min/max operator, we generate 8566 // thisMBB: 8567 // newMBB: 8568 // ld t1 = [min/max.addr] 8569 // mov t2 = [min/max.val] 8570 // cmp t1, t2 8571 // cmov[cond] t2 = t1 8572 // mov EAX = t1 8573 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8574 // bz newMBB 8575 // fallthrough -->nextMBB 8576 // 8577 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8578 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8579 MachineFunction::iterator MBBIter = MBB; 8580 ++MBBIter; 8581 8582 /// First build the CFG 8583 MachineFunction *F = MBB->getParent(); 8584 MachineBasicBlock *thisMBB = MBB; 8585 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8586 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8587 F->insert(MBBIter, newMBB); 8588 F->insert(MBBIter, nextMBB); 8589 8590 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8591 nextMBB->splice(nextMBB->begin(), thisMBB, 8592 llvm::next(MachineBasicBlock::iterator(mInstr)), 8593 thisMBB->end()); 8594 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8595 8596 // Update thisMBB to fall through to newMBB 8597 thisMBB->addSuccessor(newMBB); 8598 8599 // newMBB jumps to newMBB and fall through to nextMBB 8600 newMBB->addSuccessor(nextMBB); 8601 newMBB->addSuccessor(newMBB); 8602 8603 DebugLoc dl = mInstr->getDebugLoc(); 8604 // Insert instructions into newMBB based on incoming instruction 8605 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 8606 "unexpected number of operands"); 8607 MachineOperand& destOper = mInstr->getOperand(0); 8608 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8609 int numArgs = mInstr->getNumOperands() - 1; 8610 for (int i=0; i < numArgs; ++i) 8611 argOpers[i] = &mInstr->getOperand(i+1); 8612 8613 // x86 address has 4 operands: base, index, scale, and displacement 8614 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8615 int valArgIndx = lastAddrIndx + 1; 8616 8617 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8618 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 8619 for (int i=0; i <= lastAddrIndx; ++i) 8620 (*MIB).addOperand(*argOpers[i]); 8621 8622 // We only support register and immediate values 8623 assert((argOpers[valArgIndx]->isReg() || 8624 argOpers[valArgIndx]->isImm()) && 8625 "invalid operand"); 8626 8627 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8628 if (argOpers[valArgIndx]->isReg()) 8629 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 8630 else 8631 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 8632 (*MIB).addOperand(*argOpers[valArgIndx]); 8633 8634 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 8635 MIB.addReg(t1); 8636 8637 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 8638 MIB.addReg(t1); 8639 MIB.addReg(t2); 8640 8641 // Generate movc 8642 unsigned t3 = 
F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8643 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 8644 MIB.addReg(t2); 8645 MIB.addReg(t1); 8646 8647 // Cmp and exchange if none has modified the memory location 8648 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 8649 for (int i=0; i <= lastAddrIndx; ++i) 8650 (*MIB).addOperand(*argOpers[i]); 8651 MIB.addReg(t3); 8652 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8653 (*MIB).setMemRefs(mInstr->memoperands_begin(), 8654 mInstr->memoperands_end()); 8655 8656 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 8657 MIB.addReg(X86::EAX); 8658 8659 // insert branch 8660 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8661 8662 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 8663 return nextMBB; 8664} 8665 8666// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 8667// or XMM0_V32I8 in AVX all of this code can be replaced with that 8668// in the .td file. 8669MachineBasicBlock * 8670X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 8671 unsigned numArgs, bool memArg) const { 8672 8673 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 8674 "Target must have SSE4.2 or AVX features enabled"); 8675 8676 DebugLoc dl = MI->getDebugLoc(); 8677 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8678 8679 unsigned Opc; 8680 8681 if (!Subtarget->hasAVX()) { 8682 if (memArg) 8683 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 8684 else 8685 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 8686 } else { 8687 if (memArg) 8688 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 8689 else 8690 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 8691 } 8692 8693 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 8694 8695 for (unsigned i = 0; i < numArgs; ++i) { 8696 MachineOperand &Op = MI->getOperand(i+1); 8697 8698 if (!(Op.isReg() && Op.isImplicit())) 8699 MIB.addOperand(Op); 8700 } 8701 8702 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 8703 .addReg(X86::XMM0); 8704 8705 MI->eraseFromParent(); 8706 8707 return BB; 8708} 8709 8710MachineBasicBlock * 8711X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 8712 MachineInstr *MI, 8713 MachineBasicBlock *MBB) const { 8714 // Emit code to save XMM registers to the stack. The ABI says that the 8715 // number of registers to save is given in %al, so it's theoretically 8716 // possible to do an indirect jump trick to avoid saving all of them, 8717 // however this code takes a simpler approach and just executes all 8718 // of the stores if %al is non-zero. It's less code, and it's probably 8719 // easier on the hardware branch predictor, and stores aren't all that 8720 // expensive anyway. 8721 8722 // Create the new basic blocks. One block contains all the XMM stores, 8723 // and one block is the final destination regardless of whether any 8724 // stores were performed. 8725 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8726 MachineFunction *F = MBB->getParent(); 8727 MachineFunction::iterator MBBIter = MBB; 8728 ++MBBIter; 8729 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 8730 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 8731 F->insert(MBBIter, XMMSaveMBB); 8732 F->insert(MBBIter, EndMBB); 8733 8734 // Transfer the remainder of MBB and its successor edges to EndMBB. 
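// (This is the usual block-splitting idiom: splice() moves every instruction
//  after MI into EndMBB, and transferSuccessorsAndUpdatePHIs() rewires MBB's
//  successor edges, and any PHIs in those successors, to come from EndMBB, so
//  the new XMMSaveMBB can be inserted into the control flow in between.)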
8735 EndMBB->splice(EndMBB->begin(), MBB, 8736 llvm::next(MachineBasicBlock::iterator(MI)), 8737 MBB->end()); 8738 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 8739 8740 // The original block will now fall through to the XMM save block. 8741 MBB->addSuccessor(XMMSaveMBB); 8742 // The XMMSaveMBB will fall through to the end block. 8743 XMMSaveMBB->addSuccessor(EndMBB); 8744 8745 // Now add the instructions. 8746 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8747 DebugLoc DL = MI->getDebugLoc(); 8748 8749 unsigned CountReg = MI->getOperand(0).getReg(); 8750 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 8751 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 8752 8753 if (!Subtarget->isTargetWin64()) { 8754 // If %al is 0, branch around the XMM save block. 8755 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 8756 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 8757 MBB->addSuccessor(EndMBB); 8758 } 8759 8760 // In the XMM save block, save all the XMM argument registers. 8761 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 8762 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 8763 MachineMemOperand *MMO = 8764 F->getMachineMemOperand( 8765 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 8766 MachineMemOperand::MOStore, Offset, 8767 /*Size=*/16, /*Align=*/16); 8768 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 8769 .addFrameIndex(RegSaveFrameIndex) 8770 .addImm(/*Scale=*/1) 8771 .addReg(/*IndexReg=*/0) 8772 .addImm(/*Disp=*/Offset) 8773 .addReg(/*Segment=*/0) 8774 .addReg(MI->getOperand(i).getReg()) 8775 .addMemOperand(MMO); 8776 } 8777 8778 MI->eraseFromParent(); // The pseudo instruction is gone now. 8779 8780 return EndMBB; 8781} 8782 8783MachineBasicBlock * 8784X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 8785 MachineBasicBlock *BB) const { 8786 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8787 DebugLoc DL = MI->getDebugLoc(); 8788 8789 // To "insert" a SELECT_CC instruction, we actually have to insert the 8790 // diamond control-flow pattern. The incoming instruction knows the 8791 // destination vreg to set, the condition code register to branch on, the 8792 // true/false values to select between, and a branch opcode to use. 8793 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8794 MachineFunction::iterator It = BB; 8795 ++It; 8796 8797 // thisMBB: 8798 // ... 8799 // TrueVal = ... 8800 // cmpTY ccX, r1, r2 8801 // bCC copy1MBB 8802 // fallthrough --> copy0MBB 8803 MachineBasicBlock *thisMBB = BB; 8804 MachineFunction *F = BB->getParent(); 8805 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8806 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8807 F->insert(It, copy0MBB); 8808 F->insert(It, sinkMBB); 8809 8810 // If the EFLAGS register isn't dead in the terminator, then claim that it's 8811 // live into the sink and copy blocks. 8812 const MachineFunction *MF = BB->getParent(); 8813 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 8814 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 8815 8816 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 8817 const MachineOperand &MO = MI->getOperand(I); 8818 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 8819 unsigned Reg = MO.getReg(); 8820 if (Reg != X86::EFLAGS) continue; 8821 copy0MBB->addLiveIn(Reg); 8822 sinkMBB->addLiveIn(Reg); 8823 } 8824 8825 // Transfer the remainder of BB and its successor edges to sinkMBB. 
8826 sinkMBB->splice(sinkMBB->begin(), BB, 8827 llvm::next(MachineBasicBlock::iterator(MI)), 8828 BB->end()); 8829 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 8830 8831 // Add the true and fallthrough blocks as its successors. 8832 BB->addSuccessor(copy0MBB); 8833 BB->addSuccessor(sinkMBB); 8834 8835 // Create the conditional branch instruction. 8836 unsigned Opc = 8837 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 8838 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 8839 8840 // copy0MBB: 8841 // %FalseValue = ... 8842 // # fallthrough to sinkMBB 8843 copy0MBB->addSuccessor(sinkMBB); 8844 8845 // sinkMBB: 8846 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8847 // ... 8848 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 8849 TII->get(X86::PHI), MI->getOperand(0).getReg()) 8850 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 8851 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 8852 8853 MI->eraseFromParent(); // The pseudo instruction is gone now. 8854 return sinkMBB; 8855} 8856 8857MachineBasicBlock * 8858X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, 8859 MachineBasicBlock *BB) const { 8860 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8861 DebugLoc DL = MI->getDebugLoc(); 8862 8863 // The lowering is pretty easy: we're just emitting the call to _alloca. The 8864 // non-trivial part is impdef of ESP. 8865 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 8866 // mingw-w64. 8867 8868 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 8869 .addExternalSymbol("_alloca") 8870 .addReg(X86::EAX, RegState::Implicit) 8871 .addReg(X86::ESP, RegState::Implicit) 8872 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 8873 .addReg(X86::ESP, RegState::Define | RegState::Implicit); 8874 8875 MI->eraseFromParent(); // The pseudo instruction is gone now. 8876 return BB; 8877} 8878 8879MachineBasicBlock * 8880X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 8881 MachineBasicBlock *BB) const { 8882 // This is pretty easy. We're taking the value that we received from 8883 // our load from the relocation, sticking it in either RDI (x86-64) 8884 // or EAX and doing an indirect call. The return value will then 8885 // be in the normal return register. 8886 const X86InstrInfo *TII 8887 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 8888 DebugLoc DL = MI->getDebugLoc(); 8889 MachineFunction *F = BB->getParent(); 8890 bool IsWin64 = Subtarget->isTargetWin64(); 8891 8892 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 8893 8894 if (Subtarget->is64Bit()) { 8895 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8896 TII->get(X86::MOV64rm), X86::RDI) 8897 .addReg(X86::RIP) 8898 .addImm(0).addReg(0) 8899 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8900 MI->getOperand(3).getTargetFlags()) 8901 .addReg(0); 8902 MIB = BuildMI(*BB, MI, DL, TII->get(IsWin64 ? 
X86::WINCALL64m : X86::CALL64m)); 8903 addDirectMem(MIB, X86::RDI); 8904 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 8905 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8906 TII->get(X86::MOV32rm), X86::EAX) 8907 .addReg(0) 8908 .addImm(0).addReg(0) 8909 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8910 MI->getOperand(3).getTargetFlags()) 8911 .addReg(0); 8912 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8913 addDirectMem(MIB, X86::EAX); 8914 } else { 8915 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8916 TII->get(X86::MOV32rm), X86::EAX) 8917 .addReg(TII->getGlobalBaseReg(F)) 8918 .addImm(0).addReg(0) 8919 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8920 MI->getOperand(3).getTargetFlags()) 8921 .addReg(0); 8922 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8923 addDirectMem(MIB, X86::EAX); 8924 } 8925 8926 MI->eraseFromParent(); // The pseudo instruction is gone now. 8927 return BB; 8928} 8929 8930MachineBasicBlock * 8931X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 8932 MachineBasicBlock *BB) const { 8933 switch (MI->getOpcode()) { 8934 default: assert(false && "Unexpected instr type to insert"); 8935 case X86::MINGW_ALLOCA: 8936 return EmitLoweredMingwAlloca(MI, BB); 8937 case X86::TLSCall_32: 8938 case X86::TLSCall_64: 8939 return EmitLoweredTLSCall(MI, BB); 8940 case X86::CMOV_GR8: 8941 case X86::CMOV_V1I64: 8942 case X86::CMOV_FR32: 8943 case X86::CMOV_FR64: 8944 case X86::CMOV_V4F32: 8945 case X86::CMOV_V2F64: 8946 case X86::CMOV_V2I64: 8947 case X86::CMOV_GR16: 8948 case X86::CMOV_GR32: 8949 case X86::CMOV_RFP32: 8950 case X86::CMOV_RFP64: 8951 case X86::CMOV_RFP80: 8952 return EmitLoweredSelect(MI, BB); 8953 8954 case X86::FP32_TO_INT16_IN_MEM: 8955 case X86::FP32_TO_INT32_IN_MEM: 8956 case X86::FP32_TO_INT64_IN_MEM: 8957 case X86::FP64_TO_INT16_IN_MEM: 8958 case X86::FP64_TO_INT32_IN_MEM: 8959 case X86::FP64_TO_INT64_IN_MEM: 8960 case X86::FP80_TO_INT16_IN_MEM: 8961 case X86::FP80_TO_INT32_IN_MEM: 8962 case X86::FP80_TO_INT64_IN_MEM: { 8963 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8964 DebugLoc DL = MI->getDebugLoc(); 8965 8966 // Change the floating point control register to use "round towards zero" 8967 // mode when truncating to an integer value. 8968 MachineFunction *F = BB->getParent(); 8969 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 8970 addFrameReference(BuildMI(*BB, MI, DL, 8971 TII->get(X86::FNSTCW16m)), CWFrameIdx); 8972 8973 // Load the old value of the high byte of the control word... 8974 unsigned OldCW = 8975 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 8976 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 8977 CWFrameIdx); 8978 8979 // Set the high part to be round to zero... 8980 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 8981 .addImm(0xC7F); 8982 8983 // Reload the modified control word now... 8984 addFrameReference(BuildMI(*BB, MI, DL, 8985 TII->get(X86::FLDCW16m)), CWFrameIdx); 8986 8987 // Restore the memory image of control word to original value 8988 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 8989 .addReg(OldCW); 8990 8991 // Get the X86 opcode to use. 
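// (The IST_Fp<N>m<M> pseudo-instructions chosen below store an x87 value of
//  the RFP<M> register class to an N-bit integer memory slot, so e.g.
//  FP80_TO_INT16_IN_MEM maps to IST_Fp16m80.)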
8992 unsigned Opc; 8993 switch (MI->getOpcode()) { 8994 default: llvm_unreachable("illegal opcode!"); 8995 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 8996 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 8997 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 8998 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 8999 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 9000 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 9001 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 9002 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 9003 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 9004 } 9005 9006 X86AddressMode AM; 9007 MachineOperand &Op = MI->getOperand(0); 9008 if (Op.isReg()) { 9009 AM.BaseType = X86AddressMode::RegBase; 9010 AM.Base.Reg = Op.getReg(); 9011 } else { 9012 AM.BaseType = X86AddressMode::FrameIndexBase; 9013 AM.Base.FrameIndex = Op.getIndex(); 9014 } 9015 Op = MI->getOperand(1); 9016 if (Op.isImm()) 9017 AM.Scale = Op.getImm(); 9018 Op = MI->getOperand(2); 9019 if (Op.isImm()) 9020 AM.IndexReg = Op.getImm(); 9021 Op = MI->getOperand(3); 9022 if (Op.isGlobal()) { 9023 AM.GV = Op.getGlobal(); 9024 } else { 9025 AM.Disp = Op.getImm(); 9026 } 9027 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 9028 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 9029 9030 // Reload the original control word now. 9031 addFrameReference(BuildMI(*BB, MI, DL, 9032 TII->get(X86::FLDCW16m)), CWFrameIdx); 9033 9034 MI->eraseFromParent(); // The pseudo instruction is gone now. 9035 return BB; 9036 } 9037 // String/text processing lowering. 9038 case X86::PCMPISTRM128REG: 9039 case X86::VPCMPISTRM128REG: 9040 return EmitPCMP(MI, BB, 3, false /* in-mem */); 9041 case X86::PCMPISTRM128MEM: 9042 case X86::VPCMPISTRM128MEM: 9043 return EmitPCMP(MI, BB, 3, true /* in-mem */); 9044 case X86::PCMPESTRM128REG: 9045 case X86::VPCMPESTRM128REG: 9046 return EmitPCMP(MI, BB, 5, false /* in mem */); 9047 case X86::PCMPESTRM128MEM: 9048 case X86::VPCMPESTRM128MEM: 9049 return EmitPCMP(MI, BB, 5, true /* in mem */); 9050 9051 // Atomic Lowering. 
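// Each ATOM* pseudo below is expanded by the custom inserters above into a
// load / operate / LCMPXCHG retry loop.  For the 32-bit bitwise forms the
// generated code behaves roughly like this C-style sketch (illustrative only;
// the NAND variants additionally invert the loaded value first via the
// 'invSrc' path):
//
//   uint32_t Old, Desired;
//   do {
//     Old = *Addr;            // MOV32rm (reloaded on every retry)
//     Desired = Old & Val;    // AND32rr / AND32ri
//   } while (!__sync_bool_compare_and_swap(Addr, Old, Desired));  // LCMPXCHG32
//   // the pseudo's result is Old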
9052 case X86::ATOMAND32: 9053 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 9054 X86::AND32ri, X86::MOV32rm, 9055 X86::LCMPXCHG32, 9056 X86::NOT32r, X86::EAX, 9057 X86::GR32RegisterClass); 9058 case X86::ATOMOR32: 9059 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 9060 X86::OR32ri, X86::MOV32rm, 9061 X86::LCMPXCHG32, 9062 X86::NOT32r, X86::EAX, 9063 X86::GR32RegisterClass); 9064 case X86::ATOMXOR32: 9065 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 9066 X86::XOR32ri, X86::MOV32rm, 9067 X86::LCMPXCHG32, 9068 X86::NOT32r, X86::EAX, 9069 X86::GR32RegisterClass); 9070 case X86::ATOMNAND32: 9071 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 9072 X86::AND32ri, X86::MOV32rm, 9073 X86::LCMPXCHG32, 9074 X86::NOT32r, X86::EAX, 9075 X86::GR32RegisterClass, true); 9076 case X86::ATOMMIN32: 9077 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 9078 case X86::ATOMMAX32: 9079 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 9080 case X86::ATOMUMIN32: 9081 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 9082 case X86::ATOMUMAX32: 9083 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 9084 9085 case X86::ATOMAND16: 9086 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9087 X86::AND16ri, X86::MOV16rm, 9088 X86::LCMPXCHG16, 9089 X86::NOT16r, X86::AX, 9090 X86::GR16RegisterClass); 9091 case X86::ATOMOR16: 9092 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 9093 X86::OR16ri, X86::MOV16rm, 9094 X86::LCMPXCHG16, 9095 X86::NOT16r, X86::AX, 9096 X86::GR16RegisterClass); 9097 case X86::ATOMXOR16: 9098 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 9099 X86::XOR16ri, X86::MOV16rm, 9100 X86::LCMPXCHG16, 9101 X86::NOT16r, X86::AX, 9102 X86::GR16RegisterClass); 9103 case X86::ATOMNAND16: 9104 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9105 X86::AND16ri, X86::MOV16rm, 9106 X86::LCMPXCHG16, 9107 X86::NOT16r, X86::AX, 9108 X86::GR16RegisterClass, true); 9109 case X86::ATOMMIN16: 9110 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 9111 case X86::ATOMMAX16: 9112 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 9113 case X86::ATOMUMIN16: 9114 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 9115 case X86::ATOMUMAX16: 9116 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 9117 9118 case X86::ATOMAND8: 9119 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9120 X86::AND8ri, X86::MOV8rm, 9121 X86::LCMPXCHG8, 9122 X86::NOT8r, X86::AL, 9123 X86::GR8RegisterClass); 9124 case X86::ATOMOR8: 9125 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 9126 X86::OR8ri, X86::MOV8rm, 9127 X86::LCMPXCHG8, 9128 X86::NOT8r, X86::AL, 9129 X86::GR8RegisterClass); 9130 case X86::ATOMXOR8: 9131 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 9132 X86::XOR8ri, X86::MOV8rm, 9133 X86::LCMPXCHG8, 9134 X86::NOT8r, X86::AL, 9135 X86::GR8RegisterClass); 9136 case X86::ATOMNAND8: 9137 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9138 X86::AND8ri, X86::MOV8rm, 9139 X86::LCMPXCHG8, 9140 X86::NOT8r, X86::AL, 9141 X86::GR8RegisterClass, true); 9142 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 9143 // This group is for 64-bit host. 
9144 case X86::ATOMAND64: 9145 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9146 X86::AND64ri32, X86::MOV64rm, 9147 X86::LCMPXCHG64, 9148 X86::NOT64r, X86::RAX, 9149 X86::GR64RegisterClass); 9150 case X86::ATOMOR64: 9151 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 9152 X86::OR64ri32, X86::MOV64rm, 9153 X86::LCMPXCHG64, 9154 X86::NOT64r, X86::RAX, 9155 X86::GR64RegisterClass); 9156 case X86::ATOMXOR64: 9157 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 9158 X86::XOR64ri32, X86::MOV64rm, 9159 X86::LCMPXCHG64, 9160 X86::NOT64r, X86::RAX, 9161 X86::GR64RegisterClass); 9162 case X86::ATOMNAND64: 9163 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9164 X86::AND64ri32, X86::MOV64rm, 9165 X86::LCMPXCHG64, 9166 X86::NOT64r, X86::RAX, 9167 X86::GR64RegisterClass, true); 9168 case X86::ATOMMIN64: 9169 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 9170 case X86::ATOMMAX64: 9171 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 9172 case X86::ATOMUMIN64: 9173 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 9174 case X86::ATOMUMAX64: 9175 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 9176 9177 // This group does 64-bit operations on a 32-bit host. 9178 case X86::ATOMAND6432: 9179 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9180 X86::AND32rr, X86::AND32rr, 9181 X86::AND32ri, X86::AND32ri, 9182 false); 9183 case X86::ATOMOR6432: 9184 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9185 X86::OR32rr, X86::OR32rr, 9186 X86::OR32ri, X86::OR32ri, 9187 false); 9188 case X86::ATOMXOR6432: 9189 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9190 X86::XOR32rr, X86::XOR32rr, 9191 X86::XOR32ri, X86::XOR32ri, 9192 false); 9193 case X86::ATOMNAND6432: 9194 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9195 X86::AND32rr, X86::AND32rr, 9196 X86::AND32ri, X86::AND32ri, 9197 true); 9198 case X86::ATOMADD6432: 9199 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9200 X86::ADD32rr, X86::ADC32rr, 9201 X86::ADD32ri, X86::ADC32ri, 9202 false); 9203 case X86::ATOMSUB6432: 9204 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9205 X86::SUB32rr, X86::SBB32rr, 9206 X86::SUB32ri, X86::SBB32ri, 9207 false); 9208 case X86::ATOMSWAP6432: 9209 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9210 X86::MOV32rr, X86::MOV32rr, 9211 X86::MOV32ri, X86::MOV32ri, 9212 false); 9213 case X86::VASTART_SAVE_XMM_REGS: 9214 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 9215 } 9216} 9217 9218//===----------------------------------------------------------------------===// 9219// X86 Optimization Hooks 9220//===----------------------------------------------------------------------===// 9221 9222void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 9223 const APInt &Mask, 9224 APInt &KnownZero, 9225 APInt &KnownOne, 9226 const SelectionDAG &DAG, 9227 unsigned Depth) const { 9228 unsigned Opc = Op.getOpcode(); 9229 assert((Opc >= ISD::BUILTIN_OP_END || 9230 Opc == ISD::INTRINSIC_WO_CHAIN || 9231 Opc == ISD::INTRINSIC_W_CHAIN || 9232 Opc == ISD::INTRINSIC_VOID) && 9233 "Should use MaskedValueIsZero if you don't know whether Op" 9234 " is a target node!"); 9235 9236 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
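  // For the target nodes handled below only the boolean result is of interest;
  // it is materialized as 0 or 1, so every bit above bit 0 is known to be zero.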
9237 switch (Opc) { 9238 default: break; 9239 case X86ISD::ADD: 9240 case X86ISD::SUB: 9241 case X86ISD::SMUL: 9242 case X86ISD::UMUL: 9243 case X86ISD::INC: 9244 case X86ISD::DEC: 9245 case X86ISD::OR: 9246 case X86ISD::XOR: 9247 case X86ISD::AND: 9248 // These nodes' second result is a boolean. 9249 if (Op.getResNo() == 0) 9250 break; 9251 // Fallthrough 9252 case X86ISD::SETCC: 9253 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 9254 Mask.getBitWidth() - 1); 9255 break; 9256 } 9257} 9258 9259/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 9260/// node is a GlobalAddress + offset. 9261bool X86TargetLowering::isGAPlusOffset(SDNode *N, 9262 const GlobalValue* &GA, 9263 int64_t &Offset) const { 9264 if (N->getOpcode() == X86ISD::Wrapper) { 9265 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 9266 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 9267 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 9268 return true; 9269 } 9270 } 9271 return TargetLowering::isGAPlusOffset(N, GA, Offset); 9272} 9273 9274/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 9275/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 9276/// if the load addresses are consecutive, non-overlapping, and in the right 9277/// order. 9278static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 9279 const TargetLowering &TLI) { 9280 DebugLoc dl = N->getDebugLoc(); 9281 EVT VT = N->getValueType(0); 9282 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 9283 9284 if (VT.getSizeInBits() != 128) 9285 return SDValue(); 9286 9287 SmallVector<SDValue, 16> Elts; 9288 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 9289 Elts.push_back(DAG.getShuffleScalarElt(SVN, i)); 9290 9291 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 9292} 9293 9294/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index generation 9295/// and convert it from being a bunch of shuffles and extracts to a simple 9296/// store and scalar loads to extract the elements. 9297static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 9298 const TargetLowering &TLI) { 9299 SDValue InputVector = N->getOperand(0); 9300 9301 // Only operate on vectors of 4 elements, where the alternative shuffling 9302 // gets to be more expensive. 9303 if (InputVector.getValueType() != MVT::v4i32) 9304 return SDValue(); 9305 9306 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 9307 // single use which is a sign-extend or zero-extend, and all elements are 9308 // used. 9309 SmallVector<SDNode *, 4> Uses; 9310 unsigned ExtractedElements = 0; 9311 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 9312 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 9313 if (UI.getUse().getResNo() != InputVector.getResNo()) 9314 return SDValue(); 9315 9316 SDNode *Extract = *UI; 9317 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 9318 return SDValue(); 9319 9320 if (Extract->getValueType(0) != MVT::i32) 9321 return SDValue(); 9322 if (!Extract->hasOneUse()) 9323 return SDValue(); 9324 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 9325 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 9326 return SDValue(); 9327 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 9328 return SDValue(); 9329 9330 // Record which element was extracted.
9331 ExtractedElements |= 9332 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 9333 9334 Uses.push_back(Extract); 9335 } 9336 9337 // If not all the elements were used, this may not be worthwhile. 9338 if (ExtractedElements != 15) 9339 return SDValue(); 9340 9341 // Ok, we've now decided to do the transformation. 9342 DebugLoc dl = InputVector.getDebugLoc(); 9343 9344 // Store the value to a temporary stack slot. 9345 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 9346 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 9347 0, false, false, 0); 9348 9349 // Replace each use (extract) with a load of the appropriate element. 9350 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 9351 UE = Uses.end(); UI != UE; ++UI) { 9352 SDNode *Extract = *UI; 9353 9354 // Compute the element's address. 9355 SDValue Idx = Extract->getOperand(1); 9356 unsigned EltSize = 9357 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 9358 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 9359 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 9360 9361 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), 9362 OffsetVal, StackPtr); 9363 9364 // Load the scalar. 9365 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 9366 ScalarAddr, NULL, 0, false, false, 0); 9367 9368 // Replace the extract with the load. 9369 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 9370 } 9371 9372 // The replacement was made in place; don't return anything. 9373 return SDValue(); 9374} 9375 9376/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 9377static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 9378 const X86Subtarget *Subtarget) { 9379 DebugLoc DL = N->getDebugLoc(); 9380 SDValue Cond = N->getOperand(0); 9381 // Get the LHS/RHS of the select. 9382 SDValue LHS = N->getOperand(1); 9383 SDValue RHS = N->getOperand(2); 9384 9385 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 9386 // instructions match the semantics of the common C idiom x<y?x:y but not 9387 // x<=y?x:y, because of how they handle negative zero (which can be 9388 // ignored in unsafe-math mode). 9389 if (Subtarget->hasSSE2() && 9390 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 9391 Cond.getOpcode() == ISD::SETCC) { 9392 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 9393 9394 unsigned Opcode = 0; 9395 // Check for x CC y ? x : y. 9396 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 9397 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 9398 switch (CC) { 9399 default: break; 9400 case ISD::SETULT: 9401 // Converting this to a min would handle NaNs incorrectly, and swapping 9402 // the operands would cause it to handle comparisons between positive 9403 // and negative zero incorrectly. 9404 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 9405 if (!UnsafeFPMath && 9406 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 9407 break; 9408 std::swap(LHS, RHS); 9409 } 9410 Opcode = X86ISD::FMIN; 9411 break; 9412 case ISD::SETOLE: 9413 // Converting this to a min would handle comparisons between positive 9414 // and negative zero incorrectly.
9415 if (!UnsafeFPMath && 9416 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 9417 break; 9418 Opcode = X86ISD::FMIN; 9419 break; 9420 case ISD::SETULE: 9421 // Converting this to a min would handle both negative zeros and NaNs 9422 // incorrectly, but we can swap the operands to fix both. 9423 std::swap(LHS, RHS); 9424 case ISD::SETOLT: 9425 case ISD::SETLT: 9426 case ISD::SETLE: 9427 Opcode = X86ISD::FMIN; 9428 break; 9429 9430 case ISD::SETOGE: 9431 // Converting this to a max would handle comparisons between positive 9432 // and negative zero incorrectly. 9433 if (!UnsafeFPMath && 9434 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 9435 break; 9436 Opcode = X86ISD::FMAX; 9437 break; 9438 case ISD::SETUGT: 9439 // Converting this to a max would handle NaNs incorrectly, and swapping 9440 // the operands would cause it to handle comparisons between positive 9441 // and negative zero incorrectly. 9442 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 9443 if (!UnsafeFPMath && 9444 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 9445 break; 9446 std::swap(LHS, RHS); 9447 } 9448 Opcode = X86ISD::FMAX; 9449 break; 9450 case ISD::SETUGE: 9451 // Converting this to a max would handle both negative zeros and NaNs 9452 // incorrectly, but we can swap the operands to fix both. 9453 std::swap(LHS, RHS); 9454 case ISD::SETOGT: 9455 case ISD::SETGT: 9456 case ISD::SETGE: 9457 Opcode = X86ISD::FMAX; 9458 break; 9459 } 9460 // Check for x CC y ? y : x -- a min/max with reversed arms. 9461 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 9462 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 9463 switch (CC) { 9464 default: break; 9465 case ISD::SETOGE: 9466 // Converting this to a min would handle comparisons between positive 9467 // and negative zero incorrectly, and swapping the operands would 9468 // cause it to handle NaNs incorrectly. 9469 if (!UnsafeFPMath && 9470 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 9471 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 9472 break; 9473 std::swap(LHS, RHS); 9474 } 9475 Opcode = X86ISD::FMIN; 9476 break; 9477 case ISD::SETUGT: 9478 // Converting this to a min would handle NaNs incorrectly. 9479 if (!UnsafeFPMath && 9480 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 9481 break; 9482 Opcode = X86ISD::FMIN; 9483 break; 9484 case ISD::SETUGE: 9485 // Converting this to a min would handle both negative zeros and NaNs 9486 // incorrectly, but we can swap the operands to fix both. 9487 std::swap(LHS, RHS); 9488 case ISD::SETOGT: 9489 case ISD::SETGT: 9490 case ISD::SETGE: 9491 Opcode = X86ISD::FMIN; 9492 break; 9493 9494 case ISD::SETULT: 9495 // Converting this to a max would handle NaNs incorrectly. 9496 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 9497 break; 9498 Opcode = X86ISD::FMAX; 9499 break; 9500 case ISD::SETOLE: 9501 // Converting this to a max would handle comparisons between positive 9502 // and negative zero incorrectly, and swapping the operands would 9503 // cause it to handle NaNs incorrectly. 9504 if (!UnsafeFPMath && 9505 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 9506 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 9507 break; 9508 std::swap(LHS, RHS); 9509 } 9510 Opcode = X86ISD::FMAX; 9511 break; 9512 case ISD::SETULE: 9513 // Converting this to a max would handle both negative zeros and NaNs 9514 // incorrectly, but we can swap the operands to fix both.
9515 std::swap(LHS, RHS); 9516 case ISD::SETOLT: 9517 case ISD::SETLT: 9518 case ISD::SETLE: 9519 Opcode = X86ISD::FMAX; 9520 break; 9521 } 9522 } 9523 9524 if (Opcode) 9525 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 9526 } 9527 9528 // If this is a select between two integer constants, try to do some 9529 // optimizations. 9530 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 9531 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 9532 // Don't do this for crazy integer types. 9533 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 9534 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 9535 // so that TrueC (the true value) is larger than FalseC. 9536 bool NeedsCondInvert = false; 9537 9538 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 9539 // Efficiently invertible. 9540 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 9541 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 9542 isa<ConstantSDNode>(Cond.getOperand(1))))) { 9543 NeedsCondInvert = true; 9544 std::swap(TrueC, FalseC); 9545 } 9546 9547 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 9548 if (FalseC->getAPIntValue() == 0 && 9549 TrueC->getAPIntValue().isPowerOf2()) { 9550 if (NeedsCondInvert) // Invert the condition if needed. 9551 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9552 DAG.getConstant(1, Cond.getValueType())); 9553 9554 // Zero extend the condition if needed. 9555 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 9556 9557 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9558 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 9559 DAG.getConstant(ShAmt, MVT::i8)); 9560 } 9561 9562 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 9563 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9564 if (NeedsCondInvert) // Invert the condition if needed. 9565 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9566 DAG.getConstant(1, Cond.getValueType())); 9567 9568 // Zero extend the condition if needed. 9569 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9570 FalseC->getValueType(0), Cond); 9571 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9572 SDValue(FalseC, 0)); 9573 } 9574 9575 // Optimize cases that will turn into an LEA instruction. This requires 9576 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 9577 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9578 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9579 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9580 9581 bool isFastMultiplier = false; 9582 if (Diff < 10) { 9583 switch ((unsigned char)Diff) { 9584 default: break; 9585 case 1: // result = add base, cond 9586 case 2: // result = lea base( , cond*2) 9587 case 3: // result = lea base(cond, cond*2) 9588 case 4: // result = lea base( , cond*4) 9589 case 5: // result = lea base(cond, cond*4) 9590 case 8: // result = lea base( , cond*8) 9591 case 9: // result = lea base(cond, cond*8) 9592 isFastMultiplier = true; 9593 break; 9594 } 9595 } 9596 9597 if (isFastMultiplier) { 9598 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9599 if (NeedsCondInvert) // Invert the condition if needed. 9600 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9601 DAG.getConstant(1, Cond.getValueType())); 9602 9603 // Zero extend the condition if needed. 
9604 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9605 Cond); 9606 // Scale the condition by the difference. 9607 if (Diff != 1) 9608 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9609 DAG.getConstant(Diff, Cond.getValueType())); 9610 9611 // Add the base if non-zero. 9612 if (FalseC->getAPIntValue() != 0) 9613 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9614 SDValue(FalseC, 0)); 9615 return Cond; 9616 } 9617 } 9618 } 9619 } 9620 9621 return SDValue(); 9622} 9623 9624/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 9625static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 9626 TargetLowering::DAGCombinerInfo &DCI) { 9627 DebugLoc DL = N->getDebugLoc(); 9628 9629 // If the flag operand isn't dead, don't touch this CMOV. 9630 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 9631 return SDValue(); 9632 9633 // If this is a select between two integer constants, try to do some 9634 // optimizations. Note that the operands are ordered the opposite of SELECT 9635 // operands. 9636 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 9637 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 9638 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 9639 // larger than FalseC (the false value). 9640 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 9641 9642 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 9643 CC = X86::GetOppositeBranchCondition(CC); 9644 std::swap(TrueC, FalseC); 9645 } 9646 9647 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 9648 // This is efficient for any integer data type (including i8/i16) and 9649 // shift amount. 9650 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 9651 SDValue Cond = N->getOperand(3); 9652 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9653 DAG.getConstant(CC, MVT::i8), Cond); 9654 9655 // Zero extend the condition if needed. 9656 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 9657 9658 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9659 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 9660 DAG.getConstant(ShAmt, MVT::i8)); 9661 if (N->getNumValues() == 2) // Dead flag value? 9662 return DCI.CombineTo(N, Cond, SDValue()); 9663 return Cond; 9664 } 9665 9666 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 9667 // for any integer data type, including i8/i16. 9668 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9669 SDValue Cond = N->getOperand(3); 9670 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9671 DAG.getConstant(CC, MVT::i8), Cond); 9672 9673 // Zero extend the condition if needed. 9674 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9675 FalseC->getValueType(0), Cond); 9676 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9677 SDValue(FalseC, 0)); 9678 9679 if (N->getNumValues() == 2) // Dead flag value? 9680 return DCI.CombineTo(N, Cond, SDValue()); 9681 return Cond; 9682 } 9683 9684 // Optimize cases that will turn into an LEA instruction. This requires 9685 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
9686 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9687 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9688 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9689 9690 bool isFastMultiplier = false; 9691 if (Diff < 10) { 9692 switch ((unsigned char)Diff) { 9693 default: break; 9694 case 1: // result = add base, cond 9695 case 2: // result = lea base( , cond*2) 9696 case 3: // result = lea base(cond, cond*2) 9697 case 4: // result = lea base( , cond*4) 9698 case 5: // result = lea base(cond, cond*4) 9699 case 8: // result = lea base( , cond*8) 9700 case 9: // result = lea base(cond, cond*8) 9701 isFastMultiplier = true; 9702 break; 9703 } 9704 } 9705 9706 if (isFastMultiplier) { 9707 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9708 SDValue Cond = N->getOperand(3); 9709 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9710 DAG.getConstant(CC, MVT::i8), Cond); 9711 // Zero extend the condition if needed. 9712 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9713 Cond); 9714 // Scale the condition by the difference. 9715 if (Diff != 1) 9716 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9717 DAG.getConstant(Diff, Cond.getValueType())); 9718 9719 // Add the base if non-zero. 9720 if (FalseC->getAPIntValue() != 0) 9721 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9722 SDValue(FalseC, 0)); 9723 if (N->getNumValues() == 2) // Dead flag value? 9724 return DCI.CombineTo(N, Cond, SDValue()); 9725 return Cond; 9726 } 9727 } 9728 } 9729 } 9730 return SDValue(); 9731} 9732 9733 9734/// PerformMulCombine - Optimize a single multiply with constant into two 9735/// in order to implement it with two cheaper instructions, e.g. 9736/// LEA + SHL, LEA + LEA. 9737static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 9738 TargetLowering::DAGCombinerInfo &DCI) { 9739 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9740 return SDValue(); 9741 9742 EVT VT = N->getValueType(0); 9743 if (VT != MVT::i64) 9744 return SDValue(); 9745 9746 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 9747 if (!C) 9748 return SDValue(); 9749 uint64_t MulAmt = C->getZExtValue(); 9750 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 9751 return SDValue(); 9752 9753 uint64_t MulAmt1 = 0; 9754 uint64_t MulAmt2 = 0; 9755 if ((MulAmt % 9) == 0) { 9756 MulAmt1 = 9; 9757 MulAmt2 = MulAmt / 9; 9758 } else if ((MulAmt % 5) == 0) { 9759 MulAmt1 = 5; 9760 MulAmt2 = MulAmt / 5; 9761 } else if ((MulAmt % 3) == 0) { 9762 MulAmt1 = 3; 9763 MulAmt2 = MulAmt / 3; 9764 } 9765 if (MulAmt2 && 9766 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 9767 DebugLoc DL = N->getDebugLoc(); 9768 9769 if (isPowerOf2_64(MulAmt2) && 9770 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 9771 // If the second multiplier is pow2, issue it first. We want the multiply by 9772 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 9773 // is an add.
9774 std::swap(MulAmt1, MulAmt2); 9775 9776 SDValue NewMul; 9777 if (isPowerOf2_64(MulAmt1)) 9778 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 9779 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 9780 else 9781 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 9782 DAG.getConstant(MulAmt1, VT)); 9783 9784 if (isPowerOf2_64(MulAmt2)) 9785 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 9786 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 9787 else 9788 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 9789 DAG.getConstant(MulAmt2, VT)); 9790 9791 // Do not add new nodes to DAG combiner worklist. 9792 DCI.CombineTo(N, NewMul, false); 9793 } 9794 return SDValue(); 9795} 9796 9797static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 9798 SDValue N0 = N->getOperand(0); 9799 SDValue N1 = N->getOperand(1); 9800 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 9801 EVT VT = N0.getValueType(); 9802 9803 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 9804 // since the result of setcc_c is all zeros or all ones. 9805 if (N1C && N0.getOpcode() == ISD::AND && 9806 N0.getOperand(1).getOpcode() == ISD::Constant) { 9807 SDValue N00 = N0.getOperand(0); 9808 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 9809 ((N00.getOpcode() == ISD::ANY_EXTEND || 9810 N00.getOpcode() == ISD::ZERO_EXTEND) && 9811 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 9812 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 9813 APInt ShAmt = N1C->getAPIntValue(); 9814 Mask = Mask.shl(ShAmt); 9815 if (Mask != 0) 9816 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 9817 N00, DAG.getConstant(Mask, VT)); 9818 } 9819 } 9820 9821 return SDValue(); 9822} 9823 9824/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 9825/// when possible. 9826static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 9827 const X86Subtarget *Subtarget) { 9828 EVT VT = N->getValueType(0); 9829 if (!VT.isVector() && VT.isInteger() && 9830 N->getOpcode() == ISD::SHL) 9831 return PerformSHLCombine(N, DAG); 9832 9833 // On X86 with SSE2 support, we can transform this to a vector shift if 9834 // all elements are shifted by the same amount. We can't do this in legalize 9835 // because a constant vector is typically transformed to a constant pool 9836 // so we have no knowledge of the shift amount.
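  // As a sketch of the pattern handled below: a shift such as
  //   (shl <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>)
  // is rewritten to the immediate-count intrinsic form (here x86_sse2_pslli_d
  // with count 5) once every non-undef lane of the amount is seen to match.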
9837 if (!Subtarget->hasSSE2()) 9838 return SDValue(); 9839 9840 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 9841 return SDValue(); 9842 9843 SDValue ShAmtOp = N->getOperand(1); 9844 EVT EltVT = VT.getVectorElementType(); 9845 DebugLoc DL = N->getDebugLoc(); 9846 SDValue BaseShAmt = SDValue(); 9847 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 9848 unsigned NumElts = VT.getVectorNumElements(); 9849 unsigned i = 0; 9850 for (; i != NumElts; ++i) { 9851 SDValue Arg = ShAmtOp.getOperand(i); 9852 if (Arg.getOpcode() == ISD::UNDEF) continue; 9853 BaseShAmt = Arg; 9854 break; 9855 } 9856 for (; i != NumElts; ++i) { 9857 SDValue Arg = ShAmtOp.getOperand(i); 9858 if (Arg.getOpcode() == ISD::UNDEF) continue; 9859 if (Arg != BaseShAmt) { 9860 return SDValue(); 9861 } 9862 } 9863 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 9864 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 9865 SDValue InVec = ShAmtOp.getOperand(0); 9866 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 9867 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 9868 unsigned i = 0; 9869 for (; i != NumElts; ++i) { 9870 SDValue Arg = InVec.getOperand(i); 9871 if (Arg.getOpcode() == ISD::UNDEF) continue; 9872 BaseShAmt = Arg; 9873 break; 9874 } 9875 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 9876 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 9877 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 9878 if (C->getZExtValue() == SplatIdx) 9879 BaseShAmt = InVec.getOperand(1); 9880 } 9881 } 9882 if (BaseShAmt.getNode() == 0) 9883 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 9884 DAG.getIntPtrConstant(0)); 9885 } else 9886 return SDValue(); 9887 9888 // The shift amount is an i32. 9889 if (EltVT.bitsGT(MVT::i32)) 9890 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 9891 else if (EltVT.bitsLT(MVT::i32)) 9892 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 9893 9894 // The shift amount is identical so we can do a vector shift. 
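  // Note that BaseShAmt was normalized to i32 above; the pslli/psrli/psrai
  // intrinsics selected below take their count as a scalar i32 operand.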
9895 SDValue ValOp = N->getOperand(0); 9896 switch (N->getOpcode()) { 9897 default: 9898 llvm_unreachable("Unknown shift opcode!"); 9899 break; 9900 case ISD::SHL: 9901 if (VT == MVT::v2i64) 9902 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9903 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9904 ValOp, BaseShAmt); 9905 if (VT == MVT::v4i32) 9906 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9907 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9908 ValOp, BaseShAmt); 9909 if (VT == MVT::v8i16) 9910 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9911 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9912 ValOp, BaseShAmt); 9913 break; 9914 case ISD::SRA: 9915 if (VT == MVT::v4i32) 9916 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9917 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9918 ValOp, BaseShAmt); 9919 if (VT == MVT::v8i16) 9920 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9921 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9922 ValOp, BaseShAmt); 9923 break; 9924 case ISD::SRL: 9925 if (VT == MVT::v2i64) 9926 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9927 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9928 ValOp, BaseShAmt); 9929 if (VT == MVT::v4i32) 9930 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9931 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9932 ValOp, BaseShAmt); 9933 if (VT == MVT::v8i16) 9934 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9935 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9936 ValOp, BaseShAmt); 9937 break; 9938 } 9939 return SDValue(); 9940} 9941 9942static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 9943 TargetLowering::DAGCombinerInfo &DCI, 9944 const X86Subtarget *Subtarget) { 9945 if (DCI.isBeforeLegalizeOps()) 9946 return SDValue(); 9947 9948 EVT VT = N->getValueType(0); 9949 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 9950 return SDValue(); 9951 9952 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 9953 SDValue N0 = N->getOperand(0); 9954 SDValue N1 = N->getOperand(1); 9955 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 9956 std::swap(N0, N1); 9957 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 9958 return SDValue(); 9959 if (!N0.hasOneUse() || !N1.hasOneUse()) 9960 return SDValue(); 9961 9962 SDValue ShAmt0 = N0.getOperand(1); 9963 if (ShAmt0.getValueType() != MVT::i8) 9964 return SDValue(); 9965 SDValue ShAmt1 = N1.getOperand(1); 9966 if (ShAmt1.getValueType() != MVT::i8) 9967 return SDValue(); 9968 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 9969 ShAmt0 = ShAmt0.getOperand(0); 9970 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 9971 ShAmt1 = ShAmt1.getOperand(0); 9972 9973 DebugLoc DL = N->getDebugLoc(); 9974 unsigned Opc = X86ISD::SHLD; 9975 SDValue Op0 = N0.getOperand(0); 9976 SDValue Op1 = N1.getOperand(0); 9977 if (ShAmt0.getOpcode() == ISD::SUB) { 9978 Opc = X86ISD::SHRD; 9979 std::swap(Op0, Op1); 9980 std::swap(ShAmt0, ShAmt1); 9981 } 9982 9983 unsigned Bits = VT.getSizeInBits(); 9984 if (ShAmt1.getOpcode() == ISD::SUB) { 9985 SDValue Sum = ShAmt1.getOperand(0); 9986 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 9987 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 9988 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 9989 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 9990 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 9991 return DAG.getNode(Opc, DL, VT, 9992 Op0, Op1, 9993 DAG.getNode(ISD::TRUNCATE, DL, 9994 MVT::i8, ShAmt0)); 9995 } 9996 } else 
if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 9997 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 9998 if (ShAmt0C && 9999 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 10000 return DAG.getNode(Opc, DL, VT, 10001 N0.getOperand(0), N1.getOperand(0), 10002 DAG.getNode(ISD::TRUNCATE, DL, 10003 MVT::i8, ShAmt0)); 10004 } 10005 10006 return SDValue(); 10007} 10008 10009/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 10010static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 10011 const X86Subtarget *Subtarget) { 10012 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 10013 // the FP state in cases where an emms may be missing. 10014 // A preferable solution to the general problem is to figure out the right 10015 // places to insert EMMS. This qualifies as a quick hack. 10016 10017 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 10018 StoreSDNode *St = cast<StoreSDNode>(N); 10019 EVT VT = St->getValue().getValueType(); 10020 if (VT.getSizeInBits() != 64) 10021 return SDValue(); 10022 10023 const Function *F = DAG.getMachineFunction().getFunction(); 10024 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 10025 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 10026 && Subtarget->hasSSE2(); 10027 if ((VT.isVector() || 10028 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 10029 isa<LoadSDNode>(St->getValue()) && 10030 !cast<LoadSDNode>(St->getValue())->isVolatile() && 10031 St->getChain().hasOneUse() && !St->isVolatile()) { 10032 SDNode* LdVal = St->getValue().getNode(); 10033 LoadSDNode *Ld = 0; 10034 int TokenFactorIndex = -1; 10035 SmallVector<SDValue, 8> Ops; 10036 SDNode* ChainVal = St->getChain().getNode(); 10037 // Must be a store of a load. We currently handle two cases: the load 10038 // is a direct child, and it's under an intervening TokenFactor. It is 10039 // possible to dig deeper under nested TokenFactors. 10040 if (ChainVal == LdVal) 10041 Ld = cast<LoadSDNode>(St->getChain()); 10042 else if (St->getValue().hasOneUse() && 10043 ChainVal->getOpcode() == ISD::TokenFactor) { 10044 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 10045 if (ChainVal->getOperand(i).getNode() == LdVal) { 10046 TokenFactorIndex = i; 10047 Ld = cast<LoadSDNode>(St->getValue()); 10048 } else 10049 Ops.push_back(ChainVal->getOperand(i)); 10050 } 10051 } 10052 10053 if (!Ld || !ISD::isNormalLoad(Ld)) 10054 return SDValue(); 10055 10056 // If this is not the MMX case, i.e. we are just turning i64 load/store 10057 // into f64 load/store, avoid the transformation if there are multiple 10058 // uses of the loaded value. 10059 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 10060 return SDValue(); 10061 10062 DebugLoc LdDL = Ld->getDebugLoc(); 10063 DebugLoc StDL = N->getDebugLoc(); 10064 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 10065 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 10066 // pair instead. 10067 if (Subtarget->is64Bit() || F64IsLegal) { 10068 EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; 10069 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 10070 Ld->getBasePtr(), Ld->getSrcValue(), 10071 Ld->getSrcValueOffset(), Ld->isVolatile(), 10072 Ld->isNonTemporal(), Ld->getAlignment()); 10073 SDValue NewChain = NewLd.getValue(1); 10074 if (TokenFactorIndex != -1) { 10075 Ops.push_back(NewChain); 10076 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10077 Ops.size()); 10078 } 10079 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 10080 St->getSrcValue(), St->getSrcValueOffset(), 10081 St->isVolatile(), St->isNonTemporal(), 10082 St->getAlignment()); 10083 } 10084 10085 // Otherwise, lower to two pairs of 32-bit loads / stores. 10086 SDValue LoAddr = Ld->getBasePtr(); 10087 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 10088 DAG.getConstant(4, MVT::i32)); 10089 10090 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 10091 Ld->getSrcValue(), Ld->getSrcValueOffset(), 10092 Ld->isVolatile(), Ld->isNonTemporal(), 10093 Ld->getAlignment()); 10094 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 10095 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 10096 Ld->isVolatile(), Ld->isNonTemporal(), 10097 MinAlign(Ld->getAlignment(), 4)); 10098 10099 SDValue NewChain = LoLd.getValue(1); 10100 if (TokenFactorIndex != -1) { 10101 Ops.push_back(LoLd); 10102 Ops.push_back(HiLd); 10103 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10104 Ops.size()); 10105 } 10106 10107 LoAddr = St->getBasePtr(); 10108 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 10109 DAG.getConstant(4, MVT::i32)); 10110 10111 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 10112 St->getSrcValue(), St->getSrcValueOffset(), 10113 St->isVolatile(), St->isNonTemporal(), 10114 St->getAlignment()); 10115 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 10116 St->getSrcValue(), 10117 St->getSrcValueOffset() + 4, 10118 St->isVolatile(), 10119 St->isNonTemporal(), 10120 MinAlign(St->getAlignment(), 4)); 10121 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 10122 } 10123 return SDValue(); 10124} 10125 10126/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 10127/// X86ISD::FXOR nodes. 10128static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 10129 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 10130 // F[X]OR(0.0, x) -> x 10131 // F[X]OR(x, 0.0) -> x 10132 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10133 if (C->getValueAPF().isPosZero()) 10134 return N->getOperand(1); 10135 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10136 if (C->getValueAPF().isPosZero()) 10137 return N->getOperand(0); 10138 return SDValue(); 10139} 10140 10141/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 10142static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 10143 // FAND(0.0, x) -> 0.0 10144 // FAND(x, 0.0) -> 0.0 10145 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10146 if (C->getValueAPF().isPosZero()) 10147 return N->getOperand(0); 10148 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10149 if (C->getValueAPF().isPosZero()) 10150 return N->getOperand(1); 10151 return SDValue(); 10152} 10153 10154static SDValue PerformBTCombine(SDNode *N, 10155 SelectionDAG &DAG, 10156 TargetLowering::DAGCombinerInfo &DCI) { 10157 // BT ignores high bits in the bit index operand. 
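  // Only the low log2(BitWidth) bits of the index are demanded, so the
  // demanded-bits machinery below can shrink constants and simplify whatever
  // feeds the index operand.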
10158 SDValue Op1 = N->getOperand(1); 10159 if (Op1.hasOneUse()) { 10160 unsigned BitWidth = Op1.getValueSizeInBits(); 10161 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 10162 APInt KnownZero, KnownOne; 10163 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 10164 !DCI.isBeforeLegalizeOps()); 10165 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10166 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 10167 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 10168 DCI.CommitTargetLoweringOpt(TLO); 10169 } 10170 return SDValue(); 10171} 10172 10173static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 10174 SDValue Op = N->getOperand(0); 10175 if (Op.getOpcode() == ISD::BIT_CONVERT) 10176 Op = Op.getOperand(0); 10177 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 10178 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 10179 VT.getVectorElementType().getSizeInBits() == 10180 OpVT.getVectorElementType().getSizeInBits()) { 10181 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 10182 } 10183 return SDValue(); 10184} 10185 10186static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 10187 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 10188 // (and (i32 x86isd::setcc_carry), 1) 10189 // This eliminates the zext. This transformation is necessary because 10190 // ISD::SETCC is always legalized to i8. 10191 DebugLoc dl = N->getDebugLoc(); 10192 SDValue N0 = N->getOperand(0); 10193 EVT VT = N->getValueType(0); 10194 if (N0.getOpcode() == ISD::AND && 10195 N0.hasOneUse() && 10196 N0.getOperand(0).hasOneUse()) { 10197 SDValue N00 = N0.getOperand(0); 10198 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 10199 return SDValue(); 10200 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 10201 if (!C || C->getZExtValue() != 1) 10202 return SDValue(); 10203 return DAG.getNode(ISD::AND, dl, VT, 10204 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 10205 N00.getOperand(0), N00.getOperand(1)), 10206 DAG.getConstant(1, VT)); 10207 } 10208 10209 return SDValue(); 10210} 10211 10212SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 10213 DAGCombinerInfo &DCI) const { 10214 SelectionDAG &DAG = DCI.DAG; 10215 switch (N->getOpcode()) { 10216 default: break; 10217 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); 10218 case ISD::EXTRACT_VECTOR_ELT: 10219 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 10220 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 10221 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 10222 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 10223 case ISD::SHL: 10224 case ISD::SRA: 10225 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 10226 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 10227 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 10228 case X86ISD::FXOR: 10229 case X86ISD::FOR: return PerformFORCombine(N, DAG); 10230 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 10231 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 10232 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 10233 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 10234 } 10235 10236 return SDValue(); 10237} 10238 10239/// isTypeDesirableForOp - Return true if the target has native support for 10240/// the specified value type and it is 'desirable' to use the type for the 10241/// given node type. e.g. 
On x86 i16 is legal, but undesirable since i16 10242/// instruction encodings are longer and some i16 instructions are slow. 10243bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 10244 if (!isTypeLegal(VT)) 10245 return false; 10246 if (VT != MVT::i16) 10247 return true; 10248 10249 switch (Opc) { 10250 default: 10251 return true; 10252 case ISD::LOAD: 10253 case ISD::SIGN_EXTEND: 10254 case ISD::ZERO_EXTEND: 10255 case ISD::ANY_EXTEND: 10256 case ISD::SHL: 10257 case ISD::SRL: 10258 case ISD::SUB: 10259 case ISD::ADD: 10260 case ISD::MUL: 10261 case ISD::AND: 10262 case ISD::OR: 10263 case ISD::XOR: 10264 return false; 10265 } 10266} 10267 10268static bool MayFoldLoad(SDValue Op) { 10269 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 10270} 10271 10272static bool MayFoldIntoStore(SDValue Op) { 10273 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 10274} 10275 10276/// IsDesirableToPromoteOp - This method queries the target whether it is 10277/// beneficial for the dag combiner to promote the specified node. If true, it 10278/// should return the desired promotion type by reference. 10279bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 10280 EVT VT = Op.getValueType(); 10281 if (VT != MVT::i16) 10282 return false; 10283 10284 bool Promote = false; 10285 bool Commute = false; 10286 switch (Op.getOpcode()) { 10287 default: break; 10288 case ISD::LOAD: { 10289 LoadSDNode *LD = cast<LoadSDNode>(Op); 10290 // If the non-extending load has a single use and it's not live out, then it 10291 // might be folded. 10292 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 10293 Op.hasOneUse()*/) { 10294 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 10295 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 10296 // The only case where we'd want to promote LOAD (rather than it being 10297 // promoted as an operand) is when its only use is live out. 10298 if (UI->getOpcode() != ISD::CopyToReg) 10299 return false; 10300 } 10301 } 10302 Promote = true; 10303 break; 10304 } 10305 case ISD::SIGN_EXTEND: 10306 case ISD::ZERO_EXTEND: 10307 case ISD::ANY_EXTEND: 10308 Promote = true; 10309 break; 10310 case ISD::SHL: 10311 case ISD::SRL: { 10312 SDValue N0 = Op.getOperand(0); 10313 // Look out for (store (shl (load), x)). 10314 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 10315 return false; 10316 Promote = true; 10317 break; 10318 } 10319 case ISD::ADD: 10320 case ISD::MUL: 10321 case ISD::AND: 10322 case ISD::OR: 10323 case ISD::XOR: 10324 Commute = true; 10325 // fallthrough 10326 case ISD::SUB: { 10327 SDValue N0 = Op.getOperand(0); 10328 SDValue N1 = Op.getOperand(1); 10329 if (!Commute && MayFoldLoad(N1)) 10330 return false; 10331 // Avoid disabling potential load folding opportunities. 10332 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 10333 return false; 10334 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 10335 return false; 10336 Promote = true; 10337 } 10338 } 10339 10340 PVT = MVT::i32; 10341 return Promote; 10342} 10343 10344//===----------------------------------------------------------------------===// 10345// X86 Inline Assembly Support 10346//===----------------------------------------------------------------------===// 10347 10348static bool LowerToBSwap(CallInst *CI) { 10349 // FIXME: this should verify that we are targeting a 486 or better.
If not, 10350 // we will turn this bswap into something that will be lowered to logical ops 10351 // instead of emitting the bswap asm. For now, we don't support 486 or lower 10352 // so don't worry about this. 10353 10354 // Verify this is a simple bswap. 10355 if (CI->getNumArgOperands() != 1 || 10356 CI->getType() != CI->getArgOperand(0)->getType() || 10357 !CI->getType()->isIntegerTy()) 10358 return false; 10359 10360 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 10361 if (!Ty || Ty->getBitWidth() % 16 != 0) 10362 return false; 10363 10364 // Okay, we can do this xform, do so now. 10365 const Type *Tys[] = { Ty }; 10366 Module *M = CI->getParent()->getParent()->getParent(); 10367 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 10368 10369 Value *Op = CI->getArgOperand(0); 10370 Op = CallInst::Create(Int, Op, CI->getName(), CI); 10371 10372 CI->replaceAllUsesWith(Op); 10373 CI->eraseFromParent(); 10374 return true; 10375} 10376 10377bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 10378 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 10379 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); 10380 10381 std::string AsmStr = IA->getAsmString(); 10382 10383 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 10384 SmallVector<StringRef, 4> AsmPieces; 10385 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? 10386 10387 switch (AsmPieces.size()) { 10388 default: return false; 10389 case 1: 10390 AsmStr = AsmPieces[0]; 10391 AsmPieces.clear(); 10392 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 10393 10394 // bswap $0 10395 if (AsmPieces.size() == 2 && 10396 (AsmPieces[0] == "bswap" || 10397 AsmPieces[0] == "bswapq" || 10398 AsmPieces[0] == "bswapl") && 10399 (AsmPieces[1] == "$0" || 10400 AsmPieces[1] == "${0:q}")) { 10401 // No need to check constraints, nothing other than the equivalent of 10402 // "=r,0" would be valid here. 
10403 return LowerToBSwap(CI); 10404 } 10405 // rorw $$8, ${0:w} --> llvm.bswap.i16 10406 if (CI->getType()->isIntegerTy(16) && 10407 AsmPieces.size() == 3 && 10408 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 10409 AsmPieces[1] == "$$8," && 10410 AsmPieces[2] == "${0:w}" && 10411 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 10412 AsmPieces.clear(); 10413 const std::string &Constraints = IA->getConstraintString(); 10414 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 10415 std::sort(AsmPieces.begin(), AsmPieces.end()); 10416 if (AsmPieces.size() == 4 && 10417 AsmPieces[0] == "~{cc}" && 10418 AsmPieces[1] == "~{dirflag}" && 10419 AsmPieces[2] == "~{flags}" && 10420 AsmPieces[3] == "~{fpsr}") { 10421 return LowerToBSwap(CI); 10422 } 10423 } 10424 break; 10425 case 3: 10426 if (CI->getType()->isIntegerTy(64) && 10427 Constraints.size() >= 2 && 10428 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 10429 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 10430 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 10431 SmallVector<StringRef, 4> Words; 10432 SplitString(AsmPieces[0], Words, " \t"); 10433 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 10434 Words.clear(); 10435 SplitString(AsmPieces[1], Words, " \t"); 10436 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 10437 Words.clear(); 10438 SplitString(AsmPieces[2], Words, " \t,"); 10439 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 10440 Words[2] == "%edx") { 10441 return LowerToBSwap(CI); 10442 } 10443 } 10444 } 10445 } 10446 break; 10447 } 10448 return false; 10449} 10450 10451 10452 10453/// getConstraintType - Given a constraint letter, return the type of 10454/// constraint it is for this target. 10455X86TargetLowering::ConstraintType 10456X86TargetLowering::getConstraintType(const std::string &Constraint) const { 10457 if (Constraint.size() == 1) { 10458 switch (Constraint[0]) { 10459 case 'A': 10460 return C_Register; 10461 case 'f': 10462 case 'r': 10463 case 'R': 10464 case 'l': 10465 case 'q': 10466 case 'Q': 10467 case 'x': 10468 case 'y': 10469 case 'Y': 10470 return C_RegisterClass; 10471 case 'e': 10472 case 'Z': 10473 return C_Other; 10474 default: 10475 break; 10476 } 10477 } 10478 return TargetLowering::getConstraintType(Constraint); 10479} 10480 10481/// LowerXConstraint - try to replace an X constraint, which matches anything, 10482/// with another that has more specific requirements based on the type of the 10483/// corresponding operand. 10484const char *X86TargetLowering:: 10485LowerXConstraint(EVT ConstraintVT) const { 10486 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 10487 // 'f' like normal targets. 10488 if (ConstraintVT.isFloatingPoint()) { 10489 if (Subtarget->hasSSE2()) 10490 return "Y"; 10491 if (Subtarget->hasSSE1()) 10492 return "x"; 10493 } 10494 10495 return TargetLowering::LowerXConstraint(ConstraintVT); 10496} 10497 10498/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 10499/// vector. If it is invalid, don't add anything to Ops. 
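/// For x86 this recognizes the immediate-range constraints 'I', 'J', 'K', 'N',
/// 'e' and 'Z', plus 'i' for literal immediates and, outside PIC code, global
/// addresses with an optional constant displacement.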
10500void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 10501 char Constraint, 10502 std::vector<SDValue>&Ops, 10503 SelectionDAG &DAG) const { 10504 SDValue Result(0, 0); 10505 10506 switch (Constraint) { 10507 default: break; 10508 case 'I': 10509 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10510 if (C->getZExtValue() <= 31) { 10511 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10512 break; 10513 } 10514 } 10515 return; 10516 case 'J': 10517 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10518 if (C->getZExtValue() <= 63) { 10519 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10520 break; 10521 } 10522 } 10523 return; 10524 case 'K': 10525 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10526 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 10527 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10528 break; 10529 } 10530 } 10531 return; 10532 case 'N': 10533 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10534 if (C->getZExtValue() <= 255) { 10535 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10536 break; 10537 } 10538 } 10539 return; 10540 case 'e': { 10541 // 32-bit signed value 10542 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10543 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10544 C->getSExtValue())) { 10545 // Widen to 64 bits here to get it sign extended. 10546 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 10547 break; 10548 } 10549 // FIXME gcc accepts some relocatable values here too, but only in certain 10550 // memory models; it's complicated. 10551 } 10552 return; 10553 } 10554 case 'Z': { 10555 // 32-bit unsigned value 10556 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10557 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10558 C->getZExtValue())) { 10559 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10560 break; 10561 } 10562 } 10563 // FIXME gcc accepts some relocatable values here too, but only in certain 10564 // memory models; it's complicated. 10565 return; 10566 } 10567 case 'i': { 10568 // Literal immediates are always ok. 10569 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 10570 // Widen to 64 bits here to get it sign extended. 10571 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 10572 break; 10573 } 10574 10575 // In any sort of PIC mode addresses need to be computed at runtime by 10576 // adding in a register or some sort of table lookup. These can't 10577 // be used as immediates. 10578 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 10579 return; 10580 10581 // If we are in non-pic codegen mode, we allow the address of a global (with 10582 // an optional displacement) to be used with 'i'. 10583 GlobalAddressSDNode *GA = 0; 10584 int64_t Offset = 0; 10585 10586 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
10587 while (1) { 10588 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 10589 Offset += GA->getOffset(); 10590 break; 10591 } else if (Op.getOpcode() == ISD::ADD) { 10592 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10593 Offset += C->getZExtValue(); 10594 Op = Op.getOperand(0); 10595 continue; 10596 } 10597 } else if (Op.getOpcode() == ISD::SUB) { 10598 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10599 Offset += -C->getZExtValue(); 10600 Op = Op.getOperand(0); 10601 continue; 10602 } 10603 } 10604 10605 // Otherwise, this isn't something we can handle, reject it. 10606 return; 10607 } 10608 10609 const GlobalValue *GV = GA->getGlobal(); 10610 // If we require an extra load to get this address, as in PIC mode, we 10611 // can't accept it. 10612 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 10613 getTargetMachine()))) 10614 return; 10615 10616 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 10617 GA->getValueType(0), Offset); 10618 break; 10619 } 10620 } 10621 10622 if (Result.getNode()) { 10623 Ops.push_back(Result); 10624 return; 10625 } 10626 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 10627} 10628 10629std::vector<unsigned> X86TargetLowering:: 10630getRegClassForInlineAsmConstraint(const std::string &Constraint, 10631 EVT VT) const { 10632 if (Constraint.size() == 1) { 10633 // FIXME: not handling fp-stack yet! 10634 switch (Constraint[0]) { // GCC X86 Constraint Letters 10635 default: break; // Unknown constraint letter 10636 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 10637 if (Subtarget->is64Bit()) { 10638 if (VT == MVT::i32) 10639 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 10640 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 10641 X86::R10D,X86::R11D,X86::R12D, 10642 X86::R13D,X86::R14D,X86::R15D, 10643 X86::EBP, X86::ESP, 0); 10644 else if (VT == MVT::i16) 10645 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 10646 X86::SI, X86::DI, X86::R8W,X86::R9W, 10647 X86::R10W,X86::R11W,X86::R12W, 10648 X86::R13W,X86::R14W,X86::R15W, 10649 X86::BP, X86::SP, 0); 10650 else if (VT == MVT::i8) 10651 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 10652 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 10653 X86::R10B,X86::R11B,X86::R12B, 10654 X86::R13B,X86::R14B,X86::R15B, 10655 X86::BPL, X86::SPL, 0); 10656 10657 else if (VT == MVT::i64) 10658 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 10659 X86::RSI, X86::RDI, X86::R8, X86::R9, 10660 X86::R10, X86::R11, X86::R12, 10661 X86::R13, X86::R14, X86::R15, 10662 X86::RBP, X86::RSP, 0); 10663 10664 break; 10665 } 10666 // 32-bit fallthrough 10667 case 'Q': // Q_REGS 10668 if (VT == MVT::i32) 10669 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 10670 else if (VT == MVT::i16) 10671 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 10672 else if (VT == MVT::i8) 10673 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 10674 else if (VT == MVT::i64) 10675 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 10676 break; 10677 } 10678 } 10679 10680 return std::vector<unsigned>(); 10681} 10682 10683std::pair<unsigned, const TargetRegisterClass*> 10684X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 10685 EVT VT) const { 10686 // First, see if this is a constraint that directly corresponds to an LLVM 10687 // register class. 
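  // Single-letter constraints are matched directly here; explicit register
  // names such as "{st(0)}", "{flags}" and the 'A' register pair are fixed up
  // after the default TargetLowering lookup further below.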
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) through st(7) to ST0 through ST7.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
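    // Illustrative use (not from this file): on 32-bit x86,
    //   asm("rdtsc" : "=A"(Val64));
    // binds a 64-bit result to the EDX:EAX pair; the code below reports EAX
    // in the GR32_AD register class for that pairing.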
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit, 32-bit, or 64-bit register, map to the appropriate
  // register class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target-independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
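// Illustrative example of the remapping above (hypothetical IR, not from this
// file): an inline asm such as
//   %r = call i32 asm "", "={ax}"()
// reaches getRegForInlineAsmConstraint with Constraint == "{ax}" and VT i32.
// The generic TargetLowering lookup resolves "{ax}" to AX in GR16; the code
// above then promotes that result to EAX in GR32 so the i32 operand is not
// split across {ax} and {dx}.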